刘凡 2 years ago
commit
cb355e1311
100 changed files with 20618 additions and 0 deletions
  1. +4 -0  .gitignore
  2. +3 -0  .idea/.gitignore
  3. +15 -0  .idea/Data.iml
  4. +26 -0  .idea/inspectionProfiles/Project_Default.xml
  5. +6 -0  .idea/inspectionProfiles/profiles_settings.xml
  6. +4 -0  .idea/misc.xml
  7. +8 -0  .idea/modules.xml
  8. +27 -0  .idea/vcs.xml
  9. +3 -0  Azure/.vscode/settings.json
  10. +82 -0  Azure/AddUp/Azure-blob-storage.py
  11. +51 -0  Azure/AddUp/blob-upload-1.py
  12. +221 -0  Azure/AddUp/blob-upload-2.py
  13. +107 -0  Azure/AddUp/blob-upload.py
  14. +231 -0  Azure/AddUp/circuitbreaker.py
  15. +138 -0  Azure/AddUp/datafactory.py
  16. +202 -0  Azure/AddUp/file_advanced_samples.py
  17. +190 -0  Azure/AddUp/file_basic_samples.py
  18. +415 -0  Azure/AddUp/python-quick-start.py
  19. +218 -0  Azure/AddUp/table_advanced_samples.py
  20. +96 -0  Azure/AddUp/table_basic_samples.py
  21. +1 -0  Azure/AzureStorage
  22. +125 -0  Azure/DLfile.py
  23. +1 -0  Azure/azure-multiapi-storage-python
  24. +64 -0  Azure/blob-adapter.py
  25. +98 -0  Azure/blob-permission.py
  26. +101 -0  Azure/blob-upload-1.py
  27. +81 -0  Azure/blob-upload-2.py
  28. +57 -0  Azure/blob-upload-3.py
  29. +67 -0  Azure/blob-upload-4.py
  30. +107 -0  Azure/blob-upload.py
  31. +221 -0  Azure/django-blob.py
  32. +1 -0  Azure/python-text-classification
  33. +555 -0  Azure/storage-blob.py
  34. +130 -0  Azure/table-service.py
  35. +218 -0  Azure/table-storage.py
  36. +47 -0  BI/BIL.py
  37. +1 -0  BI/BusinessIntelligence-Kaggle
  38. +606 -0  BI/ID3_classification.py
  39. +336 -0  BI/Practica2.py
  40. +132 -0  BI/apriori.py
  41. +440 -0  BI/bi_main.py
  42. +727 -0  BI/cube-backup.py
  43. +727 -0  BI/cube.py
  44. +197 -0  BI/etl_testing.py
  45. +33 -0  BI/examples/__init__.py
  46. +63 -0  BI/examples/bart_lines.py
  47. +763 -0  BI/examples/birth_names.py
  48. +373 -0  BI/examples/countries.md
  49. +2505 -0  BI/examples/countries.py
  50. +114 -0  BI/examples/country_map.py
  51. +100 -0  BI/examples/css_templates.py
  52. +529 -0  BI/examples/deck.py
  53. +141 -0  BI/examples/energy.py
  54. +68 -0  BI/examples/flights.py
  55. +78 -0  BI/examples/helpers-backup.py
  56. +78 -0  BI/examples/helpers.py
  57. +116 -0  BI/examples/long_lat.py
  58. +224 -0  BI/examples/misc_dashboard-backup.py
  59. +224 -0  BI/examples/misc_dashboard.py
  60. +58 -0  BI/examples/multi_line.py
  61. +117 -0  BI/examples/multiformat_time_series.py
  62. +60 -0  BI/examples/paris.py
  63. +81 -0  BI/examples/random_time_series.py
  64. +62 -0  BI/examples/sf_population_polygons.py
  65. +342 -0  BI/examples/tabbed_dashboard-backup.py
  66. +342 -0  BI/examples/tabbed_dashboard.py
  67. +163 -0  BI/examples/unicode_test_data.py
  68. +574 -0  BI/examples/world_bank.py
  69. +580 -0  BI/income_disparity_final_version_2.py
  70. +338 -0  BI/macro_analysis-backup.py
  71. +338 -0  BI/macro_analysis.py
  72. +662 -0  BI/practica3.py
  73. +98 -0  Directory/IOTA2Directory.py
  74. +31 -0  Directory/advance_touch.py
  75. +213 -0  Directory/augmentation_main.py
  76. +92 -0  Directory/conftest.py
  77. +394 -0  Directory/data_preprocessing_utils.py
  78. +122 -0  Directory/diml_to_interiornet.py
  79. +177 -0  Directory/ego_to_json.py
  80. +107 -0  Directory/esquema.py
  81. +41 -0  Directory/file_handler.py
  82. +130 -0  Directory/generate_directories.py
  83. +167 -0  Directory/logging.py
  84. +27 -0  Directory/make_folder.py
  85. +90 -0  Directory/mkdir.py
  86. +135 -0  Directory/mkdirPypi.py
  87. +12 -0  Directory/mkdir_p.py
  88. +80 -0  Directory/project_creator.py
  89. +206 -0  Directory/setup.py
  90. +49 -0  Directory/split_data_in_k_folds.py
  91. +80 -0  Directory/stc_vid2frames.py
  92. +197 -0  Directory/test_archive.py
  93. +306 -0  Directory/test_tool.py
  94. +272 -0  Directory/tutorial.py
  95. +52 -0  Directory/utils.py
  96. +632 -0  Hash/EncrypC.py
  97. +139 -0  Hash/EncryptionDecryption.py
  98. +1 -0  Hash/Encryption_And_Hashing
  99. +70 -0  Hash/base64.py
  100. +485 -0  Hash/biometry_hash.py

+ 4 - 0
.gitignore

@@ -0,0 +1,4 @@
+.DS_Store
+*.pyc
+__pycache__
+.git

+ 3 - 0
.idea/.gitignore

@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml

+ 15 - 0
.idea/Data.iml

@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="format" value="PLAIN" />
+    <option name="myDocStringFormat" value="Plain" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="PROJECT_TEST_RUNNER" value="py.test" />
+  </component>
+</module>

+ 26 - 0
.idea/inspectionProfiles/Project_Default.xml

@@ -0,0 +1,26 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="6">
+            <item index="0" class="java.lang.String" itemvalue="requests" />
+            <item index="1" class="java.lang.String" itemvalue="Flask" />
+            <item index="2" class="java.lang.String" itemvalue="tqdm" />
+            <item index="3" class="java.lang.String" itemvalue="tensorboardX" />
+            <item index="4" class="java.lang.String" itemvalue="torch" />
+            <item index="5" class="java.lang.String" itemvalue="numpy" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="N801" />
+        </list>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>

+ 6 - 0
.idea/inspectionProfiles/profiles_settings.xml

@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

+ 4 - 0
.idea/misc.xml

@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (base) (2)" project-jdk-type="Python SDK" />
+</project>

+ 8 - 0
.idea/modules.xml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/Data.iml" filepath="$PROJECT_DIR$/.idea/Data.iml" />
+    </modules>
+  </component>
+</project>

+ 27 - 0
.idea/vcs.xml

@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/Azure/AzureStorage" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/Azure/azure-multiapi-storage-python" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/Azure/python-text-classification" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/BI/BusinessIntelligence-Kaggle" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/Hash/Encryption_And_Hashing" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/ML/Machine_Learning_and_Having_It_Deep_and_Structured" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/NATS/NatsExample" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/NATS/asyncio-nats-examples" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/Pseudonym/Data-Masking" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/S3/NatsExample" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/S3/odoo-s3-storage" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/S3/s3-concat" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/archive/auto-archiver" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/kafka/Calories-Alert-Kafka" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/kafka/MessageCorps" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/kafka/ai-project-fraud-detection" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/kafka/kafka-fraud-detector" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/kafka/kafkaesk" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/kafka/scrapy-kafka" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/kafka/tail2kafka" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/visualize/Visualization-of-popular-algorithms-in-Python" vcs="Git" />
+  </component>
+</project>

+ 3 - 0
Azure/.vscode/settings.json

@@ -0,0 +1,3 @@
+{
+    "python.formatting.provider": "black"
+}

+ 82 - 0
Azure/AddUp/Azure-blob-storage.py

@@ -0,0 +1,82 @@
+from azure.storage.blob import BlobClient, BlobServiceClient
+import os
+import requests
+
+def list_files() -> list:
+    file_list = []
+    
+    for root, dirs, files in os.walk("data"):
+        for name in files:
+            file_list.append({"file_name": name, "local_path": os.path.join(root,name)})
+
+    return file_list
+
+def get_filename_from_url(url: str) -> str:
+    file_name=url.split('/')[-1]
+    return file_name
+
+def get_random_images() -> list:
+    # helper function uses loremflickr.com to get a random list of images 
+    images = []
+
+    for i in range(10):
+        resp = requests.get(url=f"https://loremflickr.com/json/320/240?random={i}")
+        resp_json = resp.json()
+        images.append(resp_json["file"])
+
+    return images
+
+def create_blob_from_url(storage_connection_string,container_name):
+    try:
+        # urls to fetch into blob storage
+        url_list = get_random_images()
+
+        # Instantiate a new BlobServiceClient and a new ContainerClient
+        blob_service_client = BlobServiceClient.from_connection_string(storage_connection_string)
+        container_client = blob_service_client.get_container_client(container_name)
+
+        for u in url_list:
+            # Download file from url then upload blob file
+            r = requests.get(u, stream = True)
+            if r.status_code == 200:
+                r.raw.decode_content = True
+                blob_client = container_client.get_blob_client(get_filename_from_url(u))
+                blob_client.upload_blob(r.raw,overwrite=True)
+        return True
+        
+    except Exception as e:
+        print(e)
+        return False 
+
+def create_blob_from_path(storage_connection_string,container_name):
+    try:
+        # Instantiate a new BlobServiceClient and a new ContainerClient
+        blob_service_client = BlobServiceClient.from_connection_string(storage_connection_string)
+        container_client = blob_service_client.get_container_client(container_name)
+
+        for f in list_files():
+            with open(f["local_path"], "rb") as data:
+                blob_client = container_client.get_blob_client(f["file_name"])
+                blob_client.upload_blob(data,overwrite=True)
+        return True
+
+    except Exception as e:
+        print(e)
+        return False
+
+if __name__ == '__main__':
+
+    # get storage account settings
+    storage_connection_string = os.environ.get("STORAGE_CONNECTION_STRING")
+    container_name = os.environ.get("STORAGE_CONTAINER")
+
+    # if you want to copy from a public url
+    result = create_blob_from_url(storage_connection_string,container_name)
+    
+    # OR if you want to upload from your local drive
+    #create_blob_from_path(storage_connection_string,container_name)
+
+    if(result):
+        print("Done!")
+    else:
+        print("An error occured!")

+ 51 - 0
Azure/AddUp/blob-upload-1.py

@@ -0,0 +1,51 @@
+import os
+from flask import Flask, request, redirect, url_for
+from werkzeug import secure_filename
+from azure.storage.blob import BlockBlobService
+import string, random, requests
+
+app = Flask(__name__, instance_relative_config=True)
+
+app.config.from_pyfile('config.py')
+account = app.config['ACCOUNT']   # Azure account name
+key = app.config['STORAGE_KEY']      # Azure Storage account access key  
+container = app.config['CONTAINER'] # Container name
+
+blob_service = BlockBlobService(account_name=account, account_key=key)
+
+@app.route('/', methods=['GET', 'POST'])
+def upload_file():
+    if request.method == 'POST':
+        file = request.files['file']
+        filename = secure_filename(file.filename)
+        fileextension = filename.rsplit('.',1)[1]
+        Randomfilename = id_generator()
+        filename = Randomfilename + '.' + fileextension
+        try:
+            blob_service.create_blob_from_stream(container, filename, file)
+        except Exception as e:
+            print('Exception=' + str(e))
+        ref =  'http://'+ account + '.blob.core.windows.net/' + container + '/' + filename
+        return '''
+	    <!doctype html>
+	    <title>File Link</title>
+	    <h1>Uploaded File Link</h1>
+	    <p>''' + ref + '''</p>
+	    <img src="'''+ ref +'''">
+	    '''
+    return '''
+    <!doctype html>
+    <title>Upload new File</title>
+    <h1>Upload new File</h1>
+    <form action="" method=post enctype=multipart/form-data>
+      <p><input type=file name=file>
+         <input type=submit value=Upload>
+    </form>
+    '''
+
+def id_generator(size=32, chars=string.ascii_uppercase + string.digits):
+    return ''.join(random.choice(chars) for _ in range(size))
+
+if __name__ == '__main__':
+    app.run(debug=True)
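
blob-upload-1.py is written against the legacy azure-storage SDK (BlockBlobService) and the pre-1.0 Werkzeug import (current Werkzeug exposes it as from werkzeug.utils import secure_filename). A hedged sketch of the equivalent of blob_service.create_blob_from_stream with the current azure-storage-blob v12 client, in case the legacy package is not available:

# Sketch (assumes azure-storage-blob >= 12): v12 equivalent of
# blob_service.create_blob_from_stream(container, filename, file) used above.
from azure.storage.blob import BlobServiceClient

def upload_stream_v12(connection_string, container, filename, stream):
    service = BlobServiceClient.from_connection_string(connection_string)
    blob = service.get_blob_client(container=container, blob=filename)
    blob.upload_blob(stream, overwrite=True)  # accepts any readable stream
    return blob.url  # https://<account>.blob.core.windows.net/<container>/<filename>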

+ 221 - 0
Azure/AddUp/blob-upload-2.py

@@ -0,0 +1,221 @@
+import mimetypes
+import datetime
+
+from azure.common import AzureMissingResourceHttpError
+from azure.storage.blob import BlobService
+
+from django.core.files.storage import Storage
+from django.conf import settings
+
+try:
+    from django.utils.deconstruct import deconstructible
+except ImportError:
+    # Support for django 1.7 and below
+    def deconstructible(func):
+        return func
+
+
+@deconstructible
+class AzureStorage(Storage):
+    """
+    Custom file storage system for Azure
+    """
+
+    container = settings.AZURE_STORAGE.get('CONTAINER')
+    account_name = settings.AZURE_STORAGE.get('ACCOUNT_NAME')
+    account_key = settings.AZURE_STORAGE.get('ACCOUNT_KEY')
+    cdn_host = settings.AZURE_STORAGE.get('CDN_HOST')
+    use_ssl = settings.AZURE_STORAGE.get('USE_SSL')
+
+    def __init__(self, account_name=None, account_key=None, container=None,
+         use_ssl=None, cdn_host=None):
+
+        if account_name is not None:
+            self.account_name = account_name
+
+        if account_key is not None:
+            self.account_key = account_key
+
+        if container is not None:
+            self.container = container
+
+        if use_ssl is not None:
+            self.use_ssl = use_ssl
+
+        if cdn_host is not None:
+            self.cdn_host = cdn_host
+
+    def __getstate__(self):
+        return dict(
+            account_name=self.account_name,
+            account_key=self.account_key,
+            container=self.container,
+            cdn_host=self.cdn_host,
+            use_ssl=self.use_ssl
+        )
+
+    def _get_service(self):
+        if not hasattr(self, '_blob_service'):
+            self._blob_service = BlobService(
+                account_name=self.account_name,
+                account_key=self.account_key,
+                protocol='https' if self.use_ssl else 'http'
+            )
+
+        return self._blob_service
+
+    def _get_properties(self, name):
+        return self._get_service().get_blob_properties(
+            container_name=self.container,
+            blob_name=name
+        )
+
+    def _open(self, name, mode='rb'):
+        """
+        Return the AzureStorageFile.
+        """
+
+        from django.core.files.base import ContentFile
+
+        contents = self._get_service().get_blob_to_bytes(
+            container_name=self.container,
+            blob_name=name
+        )
+
+        return ContentFile(contents)
+
+    def _save(self, name, content):
+        """
+        Use the Azure Storage service to write ``content`` to a remote file
+        (called ``name``).
+        """
+        
+
+        content.open()
+
+        content_type = None
+
+        if hasattr(content.file, 'content_type'):
+            content_type = content.file.content_type
+        else:
+            content_type = mimetypes.guess_type(name)[0]
+
+        cache_control = self.get_cache_control(
+            self.container,
+            name,
+            content_type
+        )
+
+        self._get_service().put_block_blob_from_file(
+            container_name=self.container,
+            blob_name=name,
+            stream=content,
+            x_ms_blob_content_type=content_type,
+            cache_control=cache_control,
+            x_ms_blob_cache_control=cache_control
+        )
+
+        content.close()
+
+        return name
+
+    def listdir(self, path):
+        """
+        Lists the contents of the specified path, returning a 2-tuple of lists;
+        the first item being directories, the second item being files.
+        """
+
+        files = []
+
+        if path and not path.endswith('/'):
+            path = '%s/' % path
+
+        path_len = len(path)
+
+        if not path:
+            path = None
+
+        blob_list = self._get_service().list_blobs(self.container, prefix=path)
+
+        for blob in blob_list:
+            files.append(blob.name[path_len:])
+
+        return ([], files)
+
+    def exists(self, name):
+        """
+        Returns True if a file referenced by the given name already exists in
+        the storage system, or False if the name is available for a new file.
+        """
+        try:
+            self._get_properties(name)
+
+            return True
+        except AzureMissingResourceHttpError:
+            return False
+
+    def delete(self, name):
+        """
+        Deletes the file referenced by name.
+        """
+
+        try:
+            self._get_service().delete_blob(self.container, name)
+        except AzureMissingResourceHttpError:
+            pass
+
+    def get_cache_control(self, container, name, content_type):
+        """
+        Get the Cache-Control value for a blob, used when saving the blob on
+        Azure.  Returns `None` by default to remain compatible with the
+        default setting for the SDK.
+        """
+
+        return None
+
+    def size(self, name):
+        """
+        Returns the total size, in bytes, of the file referenced by name.
+        """
+
+        try:
+            properties = self._get_properties(name)
+
+            return int(properties['content-length'])
+        except AzureMissingResourceHttpError:
+            pass
+
+    def url(self, name):
+        """
+        Returns the URL where the contents of the file referenced by name can
+        be accessed.
+        """
+
+        blob_url_args = {
+            'container_name': self.container,
+            'blob_name': name,
+        }
+
+        if self.cdn_host:
+            # The account name should be built into the cdn hostname
+            blob_url_args['account_name'] = ''
+            blob_url_args['host_base'] = self.cdn_host
+
+        return self._get_service().make_blob_url(
+            **blob_url_args
+        )
+
+    def modified_time(self, name):
+        """
+        Returns a datetime object containing the last modified time.
+        """
+
+        try:
+            properties = self._get_properties(name)
+
+            return datetime.datetime.strptime(
+                properties['last-modified'],
+                '%a, %d %b %Y %H:%M:%S %Z'
+            )
+        except AzureMissingResourceHttpError:
+            pass
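
blob-upload-2.py is a Django storage backend that reads its configuration from a settings.AZURE_STORAGE dict. A rough sketch of the settings it expects; the dict keys come from the class above, while the DEFAULT_FILE_STORAGE dotted path is a hypothetical location for the module:

# Sketch of Django settings for the AzureStorage backend above.
AZURE_STORAGE = {
    'ACCOUNT_NAME': 'mystorageaccount',
    'ACCOUNT_KEY': '<storage account key>',
    'CONTAINER': 'media',
    'USE_SSL': True,
    'CDN_HOST': None,  # e.g. 'cdn.example.com' when the container sits behind a CDN
}

# Pre-Django-4.2 style storage setting, matching the Django 1.x era this backend targets.
DEFAULT_FILE_STORAGE = 'myproject.storage.AzureStorage'  # hypothetical module path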

+ 107 - 0
Azure/AddUp/blob-upload.py

@@ -0,0 +1,107 @@
+# ----------------------------------------------------------------------------------
+# MIT License
+#
+# Copyright(c) Microsoft Corporation. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# ----------------------------------------------------------------------------------
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+
+import os
+import uuid
+import sys
+from azure.storage.blob import BlockBlobService, PublicAccess
+
+# ---------------------------------------------------------------------------------------------------------
+# Method that creates a test file in the 'Sample' folder.
+# This sample application creates a test file, uploads the test file to the Blob storage,
+# lists the blobs in the container, and downloads the file with a new name.
+# ---------------------------------------------------------------------------------------------------------
+# Documentation References:
+# Associated Article - https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python
+# What is a Storage Account - http://azure.microsoft.com/en-us/documentation/articles/storage-whatis-account/
+# Getting Started with Blobs-https://docs.microsoft.com/en-us/azure/storage/blobs/storage-python-how-to-use-blob-storage
+# Blob Service Concepts - http://msdn.microsoft.com/en-us/library/dd179376.aspx
+# Blob Service REST API - http://msdn.microsoft.com/en-us/library/dd135733.aspx
+# ----------------------------------------------------------------------------------------------------------
+
+
+def run_sample():
+    try:
+        # Create the BlockBlobService that is used to call the Blob service for the storage account
+        blob_service_client = BlockBlobService(
+            account_name='accountname', account_key='accountkey')
+
+        # Create a container called 'quickstartblobs'.
+        container_name = 'quickstartblobs'
+        blob_service_client.create_container(container_name)
+
+        # Set the permission so the blobs are public.
+        blob_service_client.set_container_acl(
+            container_name, public_access=PublicAccess.Container)
+
+        # Create the Sample folder if it does not exist, and create a file in it to test the upload and download.
+        local_path = os.path.expanduser("~/Sample")
+        if not os.path.exists(local_path):
+            os.makedirs(os.path.expanduser("~/Sample"))
+        local_file_name = "QuickStart_" + str(uuid.uuid4()) + ".txt"
+        full_path_to_file = os.path.join(local_path, local_file_name)
+
+        # Write text to the file.
+        file = open(full_path_to_file,  'w')
+        file.write("Hello, World!")
+        file.close()
+
+        print("Temp file = " + full_path_to_file)
+        print("\nUploading to Blob storage as blob" + local_file_name)
+
+        # Upload the created file, use local_file_name for the blob name
+        blob_service_client.create_blob_from_path(
+            container_name, local_file_name, full_path_to_file)
+
+        # List the blobs in the container
+        print("\nList blobs in the container")
+        generator = blob_service_client.list_blobs(container_name)
+        for blob in generator:
+            print("\t Blob name: " + blob.name)
+
+        # Download the blob(s).
+        # Add '_DOWNLOADED' as prefix to '.txt' so you can see both files in Documents.
+        full_path_to_file2 = os.path.join(local_path, str.replace(
+            local_file_name ,'.txt', '_DOWNLOADED.txt'))
+        print("\nDownloading blob to " + full_path_to_file2)
+        blob_service_client.get_blob_to_path(
+            container_name, local_file_name, full_path_to_file2)
+
+        sys.stdout.write("Sample finished running. When you hit <any key>, the sample will be deleted and the sample "
+                         "application will exit.")
+        sys.stdout.flush()
+        input()
+
+        # Clean up resources. This includes the container and the temp files
+        blob_service_client.delete_container(container_name)
+        os.remove(full_path_to_file)
+        os.remove(full_path_to_file2)
+    except Exception as e:
+        print(e)
+
+
+# Main method.
+if __name__ == '__main__':
+    run_sample()

+ 231 - 0
Azure/AddUp/circuitbreaker.py

@@ -0,0 +1,231 @@
+# ----------------------------------------------------------------------------------
+# MIT License
+#
+# Copyright(c) Microsoft Corporation. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# ----------------------------------------------------------------------------------
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# ----------------------------------------------------------------------------------
+
+import os
+import uuid
+import time
+import sys
+from azure.storage.blob import BlockBlobService
+from azure.storage.common.models import LocationMode
+from azure.storage.common.retry import LinearRetry
+
+
+# ----------------------------------------------------------------------------------
+# Azure Storage Circuit Breaker Demo
+# INSTRUCTIONS
+# Please see the README.md file for an overview explaining this application and how to run it.
+# ----------------------------------------------------------------------------------
+# Documentation References:
+# Associated Article - https://docs.microsoft.com/en-us/azure/storage/blobs/storage-create-geo-redundant-storage-python
+# Designing HA Apps with RA-GRS storage -https://docs.microsoft.com/azure/storage/storage-designing-ha-apps-with-ra-grs/
+# Getting Started with Blobs-https://docs.microsoft.com/en-us/azure/storage/blobs/storage-python-how-to-use-blob-storage
+# Azure Storage Replication - https://docs.microsoft.com/azure/storage/storage-redundancy
+# ----------------------------------------------------------------------------------
+
+account_name = os.environ.get('accountname')
+account_key = os.environ.get('accountkey')
+
+# Track how many times retry events occur.
+retry_count = 0  # Number of retries that have occurred
+retry_threshold = 5  # Threshold number of retries before switching to secondary
+secondary_read_count = 0  # Number of reads from secondary that have occurred
+secondary_threshold = 20  # Threshold number of reads from secondary before switching back to primary
+
+# This is the CloudBlobClient object used to access the blob service
+blob_client = None
+
+# This is the container used to store and access the blob to be used for testing
+container_name = None
+
+'''
+Main method. Sets up the objects needed, then performs a blob operation in a loop,
+ responding to the Retry and Response Received events.
+'''
+
+
+def run_circuit_breaker():
+    # Name of image to use for testing.
+    image_to_upload = "HelloWorld.png"
+
+    global blob_client
+    global container_name
+    try:
+
+        # Create a reference to the blob client and container using the storage account name and key
+        blob_client = BlockBlobService(account_name, account_key)
+
+        # Make the container unique by using a UUID in the name.
+        container_name = "democontainer" + str(uuid.uuid4())
+        blob_client.create_container(container_name)
+
+    except Exception as ex:
+        print("Please make sure you have put the correct storage account name and key.")
+        print(ex)
+
+    # Define a reference to the actual blob and upload the block_blob to the newly created container
+    full_path_to_file = os.path.join(os.path.dirname(__file__), image_to_upload)
+    blob_client.create_blob_from_path(container_name, image_to_upload, full_path_to_file)
+
+    # Set the location mode to secondary, so you can check just the secondary data center.
+    blob_client.location_mode = LocationMode.SECONDARY
+    blob_client.retry = LinearRetry(backoff=0).retry
+
+    # Before proceeding, wait until the blob has been replicated to the secondary data center.
+    # Loop and check for the presence of the blob once in a second until it hits 60 seconds
+    # or until it finds it
+    counter = 0
+    while counter < 60:
+        counter += 1
+        sys.stdout.write("\nAttempt {0} to see if the blob has replicated to the secondary storage yet.".format(counter))
+        sys.stdout.flush()
+        if blob_client.exists(container_name, image_to_upload):
+            break
+
+        # Wait a second, then loop around and try again
+        # When it's finished replicating to the secondary, continue.
+        time.sleep(1)
+
+    # Set the starting LocationMode to Primary, then Secondary.
+    # Here we use the linear retry by default, but allow it to retry to secondary if
+    # the initial request to primary fails.
+    # Note that the default is Primary. You must have RA-GRS enabled to use this
+    blob_client.location_mode = LocationMode.PRIMARY
+    blob_client.retry = LinearRetry(max_attempts=retry_threshold, backoff=1).retry
+
+    ''' 
+        ************INSTRUCTIONS**************
+        To perform the test, first replace the 'accountname' and 'accountkey' with your storage account name and key.
+        Every time it calls get_blob_to_path it will hit the response_callback function.
+
+        Next, run this app. While this loop is running, pause the program by pressing any key, and
+        put the intercept code in Fiddler (that will intercept and return a 503).
+
+        For instructions on modifying Fiddler, look at the Fiddler_script.text file in this project.
+        There are also full instructions in the ReadMe_Instructions.txt file included in this project.
+
+        After adding the custom script to Fiddler, calls to primary storage will fail with a retryable
+        error which will trigger the Retrying event (above).
+        Then it will switch over and read the secondary. It will do that 20 times, then try to
+        switch back to the primary.
+        After seeing that happen, pause this again and remove the intercepting Fiddler code
+        Then you'll see it return to the primary and finish.
+        '''
+
+    print("\n\nThe application will pause at 200 unit interval")
+
+    for i in range(0, 1000):
+        if blob_client.location_mode == LocationMode.SECONDARY:
+            sys.stdout.write("S{0} ".format(str(i)))
+        else:
+            sys.stdout.write("P{0} ".format(str(i)))
+        sys.stdout.flush()
+
+        try:
+
+            # This function is called immediately after retry evaluation is performed.
+            # It is used to trigger the change from primary to secondary and back
+            blob_client.retry_callback = retry_callback
+
+            # Download the file
+            blob_client.get_blob_to_path(container_name, image_to_upload,
+                                                str.replace(full_path_to_file, ".png", "Copy.png"))
+
+            # Set the application to pause at 200 unit intervals to implement simulated failures
+            if i == 200 or i == 400 or i == 600 or i == 800:
+                sys.stdout.write("\nPress the Enter key to resume")
+                sys.stdout.flush()
+                if sys.version_info[0] < 3:
+                    raw_input()
+                else:
+                    input()
+        except Exception as ex:
+            print(ex)
+        finally:
+            # Force an exists call to succeed by resetting the status
+            blob_client.response_callback = response_callback
+
+    # Clean up resources
+    blob_client.delete_container(container_name)
+
+
+'''
+RequestCompleted Event handler
+If it's not pointing at the secondary, let it go through. It was either successful,
+or it failed with a non-retryable event.
+If it's pointing at the secondary, increment the read count.
+If the number of reads has hit the threshold of how many reads you want to do against the secondary,
+before you switch back to primary, switch back and reset the secondary_read_count.
+'''
+
+
+def response_callback(response):
+    global secondary_read_count
+    if blob_client.location_mode == LocationMode.SECONDARY:
+
+        # You're reading the secondary. Let it read the secondary [secondaryThreshold] times,
+        # then switch back to the primary and see if it is available now.
+        secondary_read_count += 1
+        if secondary_read_count >= secondary_threshold:
+            blob_client.location_mode = LocationMode.PRIMARY
+            secondary_read_count = 0
+
+
+'''
+Retry Event handler
+If it has retried more times than allowed, and it's not already pointed to the secondary,
+flip it to the secondary and reset the retry count.
+If it has retried more times than allowed, and it's already pointed to the secondary throw an exception.
+'''
+
+
+def retry_callback(retry_context):
+    global retry_count
+    retry_count = retry_context.count
+    sys.stdout.write("\nRetrying event because of failure reading the primary. RetryCount= {0}".format(retry_count))
+    sys.stdout.flush()
+
+    # Check if we have more than n-retries in which case switch to secondary
+    if retry_count >= retry_threshold:
+
+        # Check to see if we can fail over to secondary.
+        if blob_client.location_mode != LocationMode.SECONDARY:
+            blob_client.location_mode = LocationMode.SECONDARY
+            retry_count = 0
+        else:
+            raise Exception("Both primary and secondary are unreachable. "
+                            "Check your application's network connection.")
+
+
+if __name__ == '__main__':
+    print("Azure storage Circuit Breaker Sample \n")
+    try:
+        run_circuit_breaker()
+    except Exception as e:
+        print("Error thrown = {0}".format(e))
+    sys.stdout.write("\nPress any key to exit.")
+    sys.stdout.flush()
+    if sys.version_info[0]<3:
+        raw_input()
+    else:
+        input()
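
The two callbacks above implement a small failover state machine: after retry_threshold failed reads against the primary the client flips to the secondary, and after secondary_threshold reads from the secondary it probes the primary again. An SDK-independent sketch of that logic, using the same thresholds:

# Sketch of the failover logic implemented by retry_callback / response_callback above.
class FailoverState:
    def __init__(self, retry_threshold=5, secondary_threshold=20):
        self.retry_threshold = retry_threshold
        self.secondary_threshold = secondary_threshold
        self.on_secondary = False
        self.retries = 0
        self.secondary_reads = 0

    def record_retry(self):
        # mirrors retry_callback: too many failures on primary -> fail over
        self.retries += 1
        if self.retries >= self.retry_threshold:
            if self.on_secondary:
                raise RuntimeError("Both primary and secondary are unreachable.")
            self.on_secondary = True
            self.retries = 0

    def record_response(self):
        # mirrors response_callback: after N secondary reads, probe the primary again
        if self.on_secondary:
            self.secondary_reads += 1
            if self.secondary_reads >= self.secondary_threshold:
                self.on_secondary = False
                self.secondary_reads = 0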

+ 138 - 0
Azure/AddUp/datafactory.py

@@ -0,0 +1,138 @@
+from azure.common.credentials import ServicePrincipalCredentials
+from azure.mgmt.resource import ResourceManagementClient
+from azure.mgmt.datafactory import DataFactoryManagementClient
+from azure.mgmt.datafactory.models import *
+from datetime import datetime, timedelta
+import time
+
+def print_item(group):
+    """Print an Azure object instance."""
+    print("\tName: {}".format(group.name))
+    print("\tId: {}".format(group.id))
+    if hasattr(group, 'location'):
+        print("\tLocation: {}".format(group.location))
+    if hasattr(group, 'tags'):
+        print("\tTags: {}".format(group.tags))
+    if hasattr(group, 'properties'):
+        print_properties(group.properties)
+    print("\n")        
+
+def print_properties(props):
+    """Print a ResourceGroup properties instance."""
+    if props and hasattr(props, 'provisioning_state') and props.provisioning_state:
+        print("\tProperties:")
+        print("\t\tProvisioning State: {}".format(props.provisioning_state))
+    print("\n")
+
+def print_activity_run_details(activity_run):
+    """Print activity run details."""
+    print("\n\tActivity run details\n")
+    print("\tActivity run status: {}".format(activity_run.status))    
+    if activity_run.status == 'Succeeded':
+        print("\tNumber of bytes read: {}".format(activity_run.output['dataRead']))       
+        print("\tNumber of bytes written: {}".format(activity_run.output['dataWritten']))           
+        print("\tCopy duration: {}".format(activity_run.output['copyDuration']))           
+    else:
+        print("\tErrors: {}".format(activity_run.error['message']))
+
+def main():
+
+    # Azure subscription ID
+    subscription_id = '<Azure subscription ID>'
+
+    # This program creates this resource group. If it's an existing resource group, comment out the code that creates the resource group
+    rg_name = '<Azure resource group name>'
+
+    # The data factory name. It must be globally unique.
+    df_name = '<Data factory name>'        
+
+    # Specify your Active Directory client ID, client secret, and tenant ID
+    credentials = ServicePrincipalCredentials(client_id='<AAD application ID>', secret='<AAD app authentication key>', tenant='<AAD tenant ID>')
+    resource_client = ResourceManagementClient(credentials, subscription_id)
+    adf_client = DataFactoryManagementClient(credentials, subscription_id)
+
+    rg_params = {'location':'eastus'}
+    df_params = {'location':'eastus'}
+
+    # create the resource group
+    # comment out if the resource group already exists
+    resource_client.resource_groups.create_or_update(rg_name, rg_params)
+
+    # Create a data factory
+    df_resource = Factory(location='eastus')
+    df = adf_client.factories.create_or_update(rg_name, df_name, df_resource)
+    print_item(df)
+    while df.provisioning_state != 'Succeeded':
+        df = adf_client.factories.get(rg_name, df_name)
+        time.sleep(1)
+
+    # Create an Azure Storage linked service
+    ls_name = 'storageLinkedService'
+
+    # Specify the name and key of your Azure Storage account
+    storage_string = SecureString('DefaultEndpointsProtocol=https;AccountName=<Azure storage account>;AccountKey=<Azure storage authentication key>')
+
+    ls_azure_storage = AzureStorageLinkedService(connection_string=storage_string)
+    ls = adf_client.linked_services.create_or_update(rg_name, df_name, ls_name, ls_azure_storage)
+    print_item(ls)
+
+    # Create an Azure blob dataset (input)
+    ds_name = 'ds_in'
+    ds_ls = LinkedServiceReference(ls_name)
+    blob_path= 'adftutorial/inputpy'
+    blob_filename = 'input.txt'
+    ds_azure_blob= AzureBlobDataset(ds_ls, folder_path=blob_path, file_name = blob_filename)
+    ds = adf_client.datasets.create_or_update(rg_name, df_name, ds_name, ds_azure_blob)
+    print_item(ds)
+
+    # Create an Azure blob dataset (output)
+    dsOut_name = 'ds_out'
+    output_blobpath = 'adftutorial/outputpy'
+    dsOut_azure_blob = AzureBlobDataset(ds_ls, folder_path=output_blobpath)
+    dsOut = adf_client.datasets.create_or_update(rg_name, df_name, dsOut_name, dsOut_azure_blob)
+    print_item(dsOut)
+
+    # Create a copy activity
+    act_name =  'copyBlobtoBlob'
+    blob_source = BlobSource()
+    blob_sink = BlobSink()
+    dsin_ref = DatasetReference(ds_name)
+    dsOut_ref = DatasetReference(dsOut_name)
+    copy_activity = CopyActivity(act_name,inputs=[dsin_ref], outputs=[dsOut_ref], source=blob_source, sink=blob_sink)
+
+    # Create a pipeline with the copy activity
+    p_name =  'copyPipeline'
+    params_for_pipeline = {}
+    p_obj = PipelineResource(activities=[copy_activity], parameters=params_for_pipeline)
+    p = adf_client.pipelines.create_or_update(rg_name, df_name, p_name, p_obj)
+    print_item(p)
+
+    # Create a pipeline run
+    run_response = adf_client.pipelines.create_run(rg_name, df_name, p_name,
+        {
+        }
+    )
+
+    # Monitor the pipeline run
+    time.sleep(30)
+    pipeline_run = adf_client.pipeline_runs.get(rg_name, df_name, run_response.run_id)
+    print("\n\tPipeline run status: {}".format(pipeline_run.status))
+    activity_runs_paged = list(adf_client.activity_runs.list_by_pipeline_run(rg_name, df_name, pipeline_run.run_id, datetime.now() - timedelta(1),  datetime.now() + timedelta(1)))
+    print_activity_run_details(activity_runs_paged[0])
+
+    # Create a trigger
+    tr_name = 'mytrigger'
+    scheduler_recurrence = ScheduleTriggerRecurrence(frequency='Minute', interval='15',start_time=datetime.now(), end_time=datetime.now() + timedelta(1), time_zone='UTC') 
+    pipeline_parameters = {'inputPath':'adftutorial/inputpy', 'outputPath':'adftutorial/outputpy'}
+    pipelines_to_run = []
+    pipeline_reference = PipelineReference('copyPipeline')
+    pipelines_to_run.append(TriggerPipelineReference(pipeline_reference, pipeline_parameters))
+    tr_properties = ScheduleTrigger(description='My scheduler trigger', pipelines = pipelines_to_run, recurrence=scheduler_recurrence)    
+    adf_client.triggers.create_or_update(rg_name, df_name, tr_name, tr_properties)
+
+    # start the trigger
+    adf_client.triggers.start(rg_name, df_name, tr_name)
+    
+
+# Start the main method
+main()
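
datafactory.py authenticates with the deprecated azure.common.credentials.ServicePrincipalCredentials. A sketch of the same client setup with azure-identity, which newer azure-mgmt-datafactory releases accept (assumes those packages are installed; the placeholders mirror the ones above):

# Sketch: newer credential type in place of ServicePrincipalCredentials.
from azure.identity import ClientSecretCredential
from azure.mgmt.resource import ResourceManagementClient
from azure.mgmt.datafactory import DataFactoryManagementClient

subscription_id = '<Azure subscription ID>'
credential = ClientSecretCredential(
    tenant_id='<AAD tenant ID>',
    client_id='<AAD application ID>',
    client_secret='<AAD app authentication key>',
)
resource_client = ResourceManagementClient(credential, subscription_id)
adf_client = DataFactoryManagementClient(credential, subscription_id)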

+ 202 - 0
Azure/AddUp/file_advanced_samples.py

@@ -0,0 +1,202 @@
+#-------------------------------------------------------------------------
+# Microsoft Developer & Platform Evangelism
+#
+# Copyright (c) Microsoft Corporation. All rights reserved.
+#
+# THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, 
+# EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES 
+# OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
+#----------------------------------------------------------------------------------
+# The example companies, organizations, products, domain names,
+# e-mail addresses, logos, people, places, and events depicted
+# herein are fictitious. No association with any real company,
+# organization, product, domain name, email address, logo, person,
+# places, or events is intended or should be inferred.
+#--------------------------------------------------------------------------
+
+import os
+from random_data import RandomData
+
+from azure.storage.fileshare import ShareServiceClient
+from azure.storage.fileshare import CorsRule, RetentionPolicy, Metrics
+
+#
+# Azure File Service Sample - Demonstrate how to perform common tasks using the Microsoft Azure File Service.  
+#  
+# Documentation References:  
+#  - What is a Storage Account - http://azure.microsoft.com/en-us/documentation/articles/storage-whatis-account/  
+#  - Getting Started with Files - https://azure.microsoft.com/en-us/documentation/articles/storage-python-how-to-use-file-storage/  
+#  - File Service Concepts - http://msdn.microsoft.com/en-us/library/dn166972.aspx  
+#  - File Service REST API - http://msdn.microsoft.com/en-us/library/dn167006.aspx  
+#  - Storage Emulator - http://azure.microsoft.com/en-us/documentation/articles/storage-use-emulator/
+#  
+class FileAdvancedSamples():
+
+    def __init__(self):
+        self.random_data = RandomData()
+    
+    # Runs all samples for Azure Storage File service.
+    def run_all_samples(self, connection_string):
+        print('Azure Storage File Advanced samples - Starting.')
+        
+        try:
+            # Create an instance of ShareServiceClient
+            service = ShareServiceClient.from_connection_string(conn_str=connection_string)
+
+            # List shares
+            print('\n\n* List shares *\n')
+            self.list_shares(service)
+
+            # Set Cors
+            print('\n\n* Set cors rules *\n')
+            self.set_cors_rules(service)
+
+            # Set Service Properties
+            print('\n\n* Set service properties *\n')
+            self.set_service_properties(service)
+
+            # Share, directory and file properties and metadata
+            print('\n\n* Metadata and properties *\n')
+            self.metadata_and_properties(service)
+
+        except Exception as e:
+            print('Error occurred in the sample.', e) 
+
+        finally:
+            print('\nAzure Storage File Advanced samples - Completed.\n')
+    
+    # List file shares
+    def list_shares(self, service):
+        share_prefix = 'sharesample' + self.random_data.get_random_name(6)
+
+        try:        
+            print('1. Create multiple shares with prefix: ', share_prefix)
+            for i in range(5):
+                service.create_share(share_name=share_prefix + str(i))
+            
+            print('2. List shares')
+            shares = service.list_shares()
+            for share in shares:
+                print('  Share name:' + share.name)
+
+        except Exception as e:
+            print(e) 
+
+        finally:
+            print('3. Delete shares with prefix:' + share_prefix) 
+            for i in range(5):
+                service.delete_share(share_prefix + str(i))
+    
+
+    # Set CORS
+    def set_cors_rules(self, service):
+        print('1. Get Cors Rules')
+        original_cors_rules = service.get_service_properties()['cors']
+
+        print('2. Overwrite Cors Rules')
+        cors_rule = CorsRule(
+            allowed_origins=['*'], 
+            allowed_methods=['POST', 'GET'],
+            allowed_headers=['*'],
+            exposed_headers=['*'],
+            max_age_in_seconds=3600)
+
+        try:
+            service.set_service_properties(cors=[cors_rule])
+        except Exception as e:
+            print(e)
+        finally:
+            #reverting cors rules back to the original ones
+            print('3. Revert Cors Rules back to the original ones')
+            service.set_service_properties(cors=original_cors_rules)
+        
+        print("CORS sample completed")
+    
+
+    # Manage properties of the File service, including logging and metrics settings, and the default service version.
+    def set_service_properties(self, service):
+
+        print('1. Get File service properties')
+        props = service.get_service_properties()
+
+        retention = RetentionPolicy(enabled=True, days=5)
+        hour_metrics = Metrics(enabled=True, include_apis=True, retention_policy=retention)
+        minute_metrics = Metrics(enabled=False)
+
+        try:
+            print('2. Overwrite File service properties')
+            service.set_service_properties(hour_metrics=hour_metrics, minute_metrics=minute_metrics)
+
+        finally:
+            print('3. Revert File service properties back to the original ones')
+            service.set_service_properties(hour_metrics=props['hour_metrics'], minute_metrics=props['minute_metrics'])
+
+        print('4. Set File service properties completed')
+    
+
+    # Manage metadata and properties of the share
+    def metadata_and_properties(self, service):
+        share_name = 'sharename' + self.random_data.get_random_name(6)
+
+        try:
+            # All directories and files must be created in a parent share.
+            # Max capacity: 5TB per share
+            print('1. Create sample share with name ' + share_name)
+            quota = 1 # in GB
+            metadata = { "foo": "bar", "baz": "foo" }
+            share_client = service.create_share(share_name=share_name, metadata=metadata, quota=quota)
+            print('Sample share "'+ share_name +'" created.')
+
+            print('2. Get share properties.')
+            properties = share_client.get_share_properties()
+
+            print('3. Get share metadata.')
+            get_metadata = properties['metadata']
+            for k, v in get_metadata.items():
+                print("\t" + k + ": " + v)
+
+            dir_name = 'dirname' + self.random_data.get_random_name(6)
+
+            print('4. Create sample directory with name ' + dir_name)
+            metadata = { "abc": "def", "jkl": "mno" }
+            directory_client = share_client.create_directory(dir_name, metadata=metadata)
+            print('Sample directory "'+ dir_name +'" created.')
+
+            print('5. Get directory properties.')
+            properties = directory_client.get_directory_properties()
+            
+            print('6. Get directory metadata.')
+            get_metadata = properties['metadata']
+            for k, v in get_metadata.items():
+                print("\t" + k + ": " + v)
+
+            file_name = 'sample.txt'
+            # Uploading text to share_name/dir_name/sample.txt in Azure Files account.
+            # Max capacity: 1TB per file
+            print('7. Upload sample file from text to directory.')
+            metadata = { "prop1": "val1", "prop2": "val2" }
+            file_client = directory_client.get_file_client(file_name)
+            file_client.upload_file('Hello World! - from text sample', metadata=metadata)
+            print('Sample file "' + file_name + '" created and uploaded to: ' + share_name + '/' + dir_name)        
+
+            print('8. Get file properties.')
+            properties = file_client.get_file_properties()
+
+            print('9. Get file metadata.')
+            get_metadata = properties['metadata']
+            for k, v in get_metadata.items():
+                print("\t" + k + ": " + v)
+
+            # This is for demo purposes, all files will be deleted when share is deleted
+            print('10. Delete file.')
+            file_client.delete_file()
+
+            # This is for demo purposes, all directories will be deleted when share is deleted
+            print('11. Delete directory.')
+            directory_client.delete_directory()
+
+        finally:
+            print('12. Delete share.')
+            share_client.delete_share()
+
+        print("Metadata and properties sample completed")

+ 190 - 0
Azure/AddUp/file_basic_samples.py

@@ -0,0 +1,190 @@
+#-------------------------------------------------------------------------
+# Microsoft Developer & Platform Evangelism
+#
+# Copyright (c) Microsoft Corporation. All rights reserved.
+#
+# THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, 
+# EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES 
+# OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
+#----------------------------------------------------------------------------------
+# The example companies, organizations, products, domain names,
+# e-mail addresses, logos, people, places, and events depicted
+# herein are fictitious. No association with any real company,
+# organization, product, domain name, email address, logo, person,
+# places, or events is intended or should be inferred.
+#--------------------------------------------------------------------------
+
+from random_data import RandomData
+import tempfile
+import os
+
+from azure.storage.fileshare import ShareServiceClient
+
+
+class FileBasicSamples():
+
+    def __init__(self):
+        self.random_data = RandomData()
+
+    # Runs all samples for Azure Storage File service.
+    def run_all_samples(self, connection_string):
+        print('Azure Storage File Basic samples - Starting.')
+        
+        #declare variables
+        filename = 'filesample' + self.random_data.get_random_name(6)
+        sharename = 'sharesample' + self.random_data.get_random_name(6)
+        
+        try:
+            # Create an instance of ShareServiceClient
+            service = ShareServiceClient.from_connection_string(conn_str=connection_string)
+
+            print('\n\n* Basic file operations *\n')
+            self.basic_file_operations(sharename, filename, service)
+
+        except Exception as e:
+            print('error: ' + str(e))
+
+        finally:
+            # Delete all Azure Files created in this sample
+            self.file_delete_samples(sharename, filename, service)
+
+        print('\nAzure Storage File Basic samples - Completed.\n')
+    
+    def basic_file_operations(self, sharename, filename, service):
+        # Creating an SMB file share in your Azure Files account.
+        print('\nAttempting to create a sample file from text for upload demonstration.')   
+        # All directories and files must be created in a parent share.
+        # Max capacity: 5TB per share
+
+        print('Creating sample share.')
+        share_client = service.create_share(share_name=sharename)
+        print('Sample share "'+ sharename +'" created.')
+
+
+        # Creating an optional file directory in your Azure Files account.
+        print('Creating a sample directory.')    
+        # Get the directory client
+        directory_client = share_client.create_directory("mydirectory")
+        print('Sample directory "mydirectory" created.')
+
+
+        # Uploading text to sharename/mydirectory/my_text_file in Azure Files account.
+        # Max capacity: 1TB per file
+        print('Uploading a sample file from text.')   
+        # create_file_client
+        file_client = directory_client.get_file_client(filename)
+        # Upload a file
+        file_client.upload_file('Hello World! - from text sample')
+        print('Sample file "' + filename + '" created and uploaded to: ' + sharename + '/mydirectory')
+  
+
+        # Demonstrate how to copy a file
+        print('\nCopying file ' + filename)
+        # Create another file client which will copy the file from url
+        destination_file_client = share_client.get_file_client('file1copy')
+
+        # Copy the sample source file from the url to the destination file
+        copy_resp = destination_file_client.start_copy_from_url(source_url=file_client.url)
+        if copy_resp['copy_status'] == 'pending':
+            # Demonstrate how to abort a copy operation (just for demo, probably will never get here)
+            print('Abort copy operation')
+            destination_file_client.abort_copy()
+        else:
+            print('Copy was a ' + copy_resp['copy_status'])
+        
+
+        # Demonstrate how to create a share and upload a file from a local temporary file path
+        print('\nAttempting to upload a sample file from path for upload demonstration.')  
+        # Creating a temporary file to upload to Azure Files
+        print('Creating a temporary file from text.') 
+        with tempfile.NamedTemporaryFile(delete=False) as my_temp_file:
+            my_temp_file.file.write(b"Hello world!")
+        print('Sample temporary file created.') 
+
+        # Uploading my_temp_file to sharename folder in Azure Files
+        # Max capacity: 1TB per file
+        print('Uploading a sample file from local path.')
+        # Create file_client
+        file_client = share_client.get_file_client(filename)
+
+        # Upload a file
+        with open(my_temp_file.name, "rb") as source_file:
+            file_client.upload_file(source_file)
+
+        print('Sample file "' + filename + '" uploaded from path to share: ' + sharename)
+
+        # Close the temp file
+        my_temp_file.close()
+
+        # Get the list of valid ranges and write to the specified range
+        print('\nGet list of valid ranges of the file.') 
+        file_ranges = file_client.get_ranges()
+
+        data = b'abcdefghijkl'
+        print('Put a range of data to the file.')
+        
+        file_client.upload_range(data=data, offset=file_ranges[0]['start'], length=len(data))
+
+
+        # Demonstrate how to download a file from Azure Files
+        # The following example download the file that was previously uploaded to Azure Files
+        print('\nAttempting to download a sample file from Azure files for demonstration.')
+
+        destination_file = os.path.join(tempfile.gettempdir(), 'mypathfile.txt')
+
+        with open(destination_file, "wb") as file_handle:
+            data = file_client.download_file()
+            data.readinto(file_handle)
+
+        print('Sample file downloaded to: ' + destination_file)
+
+
+        # Demonstrate how to list the files and directories contained under an Azure File share
+        print('\nAttempting to list files and directories under share "' + sharename + '":')
+
+        # Create a generator to list directories and files under share
+        # This is not a recursive listing operation
+        generator = share_client.list_directories_and_files()
+
+        # Prints the directories and files under the share
+        for file_or_dir in generator:
+            print(file_or_dir['name'])
+        
+        # remove temp file
+        os.remove(my_temp_file.name)
+
+        print('Files and directories under share "' + sharename + '" listed.')
+        print('\nCompleted successfully - Azure basic Files operations.')
+
+
+    # Demonstrate how to delete azure files created for this demonstration
+    # Warning: Deleting a share or directory will also delete all files and directories that are contained in it.
+    def file_delete_samples(self, sharename, filename, service):
+        print('\nDeleting all samples created for this demonstration.')
+
+        try:
+            # Deleting file: 'sharename/mydirectory/filename'
+            # This is for demo purposes only; it is unnecessary, because deleting the share later removes the file as well
+            print('Deleting a sample file.')
+
+            share_client = service.get_share_client(sharename)
+            directory_client = share_client.get_directory_client('mydirectory')
+            
+            directory_client.delete_file(file_name=filename)
+            print('Sample file "' + filename + '" deleted from: ' + sharename + '/mydirectory' )
+
+            # Deleting directory: 'sharename/mydirectory'
+            print('Deleting sample directory and all files and directories under it.')
+            share_client.delete_directory('mydirectory')
+            print('Sample directory "/mydirectory" deleted from: ' + sharename)
+
+            # Deleting share: 'sharename'
+            print('Deleting sample share ' + sharename + ' and all files and directories under it.')
+            share_client.delete_share()
+            print('Sample share "' + sharename + '" deleted.')
+
+            print('\nCompleted successfully - Azure Files samples deleted.')
+
+        except Exception as e:
+            print('********ErrorDelete***********')
+            print(e)

+ 415 - 0
Azure/AddUp/python-quick-start.py

@@ -0,0 +1,415 @@
+# python quickstart client Code Sample
+#
+# Copyright (c) Microsoft Corporation
+#
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+"""
+Create a pool of compute nodes and use it to output text files stored in Azure Blob storage.
+"""
+
+import datetime
+import io
+import os
+import sys
+import time
+
+from azure.storage.blob import (
+    BlobServiceClient,
+    BlobSasPermissions,
+    generate_blob_sas
+)
+from azure.batch import BatchServiceClient
+from azure.batch.batch_auth import SharedKeyCredentials
+import azure.batch.models as batchmodels
+from azure.core.exceptions import ResourceExistsError
+
+import config
+
+DEFAULT_ENCODING = "utf-8"
+
+
+# Update the Batch and Storage account credential strings in config.py with values
+# unique to your accounts. These are used when constructing connection strings
+# for the Batch and Storage client objects.
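+# A config.py for this sample is assumed to define values along these lines (placeholders, not real credentials):
+#   BATCH_ACCOUNT_NAME = 'mybatchaccount'
+#   BATCH_ACCOUNT_KEY = '<batch-account-key>'
+#   BATCH_ACCOUNT_URL = 'https://mybatchaccount.<region>.batch.azure.com'
+#   STORAGE_ACCOUNT_NAME = 'mystorageaccount'
+#   STORAGE_ACCOUNT_KEY = '<storage-account-key>'
+#   STORAGE_ACCOUNT_DOMAIN = 'blob.core.windows.net'
+#   POOL_ID, POOL_NODE_COUNT, POOL_VM_SIZE, JOB_ID, STANDARD_OUT_FILE_NAME = 'stdout.txt'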
+
+def query_yes_no(question: str, default: str = "yes") -> str:
+    """
+    Prompts the user for yes/no input, displaying the specified question text.
+
+    :param str question: The text of the prompt for input.
+    :param str default: The default if the user hits <ENTER>. Acceptable values
+    are 'yes', 'no', and None.
+    :return: 'yes' or 'no'
+    """
+    valid = {'y': 'yes', 'n': 'no'}
+    if default is None:
+        prompt = ' [y/n] '
+    elif default == 'yes':
+        prompt = ' [Y/n] '
+    elif default == 'no':
+        prompt = ' [y/N] '
+    else:
+        raise ValueError(f"Invalid default answer: '{default}'")
+
+    choice = default
+
+    while 1:
+        user_input = input(question + prompt).lower()
+        if not user_input:
+            break
+        try:
+            choice = valid[user_input[0]]
+            break
+        except (KeyError, IndexError):
+            print("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
+
+    return choice
+
+
+def print_batch_exception(batch_exception: batchmodels.BatchErrorException):
+    """
+    Prints the contents of the specified Batch exception.
+
+    :param batch_exception:
+    """
+    print('-------------------------------------------')
+    print('Exception encountered:')
+    if batch_exception.error and \
+            batch_exception.error.message and \
+            batch_exception.error.message.value:
+        print(batch_exception.error.message.value)
+        if batch_exception.error.values:
+            print()
+            for mesg in batch_exception.error.values:
+                print(f'{mesg.key}:\t{mesg.value}')
+    print('-------------------------------------------')
+
+
+def upload_file_to_container(blob_storage_service_client: BlobServiceClient,
+                             container_name: str, file_path: str) -> batchmodels.ResourceFile:
+    """
+    Uploads a local file to an Azure Blob storage container.
+
+    :param blob_storage_service_client: A blob service client.
+    :param str container_name: The name of the Azure Blob storage container.
+    :param str file_path: The local path to the file.
+    :return: A ResourceFile initialized with a SAS URL appropriate for Batch
+    tasks.
+    """
+    blob_name = os.path.basename(file_path)
+    blob_client = blob_storage_service_client.get_blob_client(container_name, blob_name)
+
+    print(f'Uploading file {file_path} to container [{container_name}]...')
+
+    with open(file_path, "rb") as data:
+        blob_client.upload_blob(data, overwrite=True)
+
+    sas_token = generate_blob_sas(
+        config.STORAGE_ACCOUNT_NAME,
+        container_name,
+        blob_name,
+        account_key=config.STORAGE_ACCOUNT_KEY,
+        permission=BlobSasPermissions(read=True),
+        expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=2)
+    )
+
+    sas_url = generate_sas_url(
+        config.STORAGE_ACCOUNT_NAME,
+        config.STORAGE_ACCOUNT_DOMAIN,
+        container_name,
+        blob_name,
+        sas_token
+    )
+
+    return batchmodels.ResourceFile(
+        http_url=sas_url,
+        file_path=blob_name
+    )
+
+
+def generate_sas_url(
+    account_name: str,
+    account_domain: str,
+    container_name: str,
+    blob_name: str,
+    sas_token: str
+) -> str:
+    """
+    Generates and returns a sas url for accessing blob storage
+    """
+    return f"https://{account_name}.{account_domain}/{container_name}/{blob_name}?{sas_token}"
+
+
+def create_pool(batch_service_client: BatchServiceClient, pool_id: str):
+    """
+    Creates a pool of compute nodes with the specified OS settings.
+
+    :param batch_service_client: A Batch service client.
+    :param str pool_id: An ID for the new pool.
+    :param str publisher: Marketplace image publisher
+    :param str offer: Marketplace image offer
+    :param str sku: Marketplace image sku
+    """
+    print(f'Creating pool [{pool_id}]...')
+
+    # Create a new pool of Linux compute nodes using an Azure Virtual Machines
+    # Marketplace image. For more information about creating pools of Linux
+    # nodes, see:
+    # https://azure.microsoft.com/documentation/articles/batch-linux-nodes/
+    new_pool = batchmodels.PoolAddParameter(
+        id=pool_id,
+        virtual_machine_configuration=batchmodels.VirtualMachineConfiguration(
+            image_reference=batchmodels.ImageReference(
+                publisher="canonical",
+                offer="0001-com-ubuntu-server-focal",
+                sku="20_04-lts",
+                version="latest"
+            ),
+            node_agent_sku_id="batch.node.ubuntu 20.04"),
+        vm_size=config.POOL_VM_SIZE,
+        target_dedicated_nodes=config.POOL_NODE_COUNT
+    )
+    batch_service_client.pool.add(new_pool)
+
+
+def create_job(batch_service_client: BatchServiceClient, job_id: str, pool_id: str):
+    """
+    Creates a job with the specified ID, associated with the specified pool.
+
+    :param batch_service_client: A Batch service client.
+    :param str job_id: The ID for the job.
+    :param str pool_id: The ID for the pool.
+    """
+    print(f'Creating job [{job_id}]...')
+
+    job = batchmodels.JobAddParameter(
+        id=job_id,
+        pool_info=batchmodels.PoolInformation(pool_id=pool_id))
+
+    batch_service_client.job.add(job)
+
+
+def add_tasks(batch_service_client: BatchServiceClient, job_id: str, resource_input_files: list):
+    """
+    Adds a task for each input file in the collection to the specified job.
+
+    :param batch_service_client: A Batch service client.
+    :param str job_id: The ID of the job to which to add the tasks.
+    :param list resource_input_files: A collection of input files. One task will be
+     created for each input file.
+    """
+
+    print(f'Adding {len(resource_input_files)} tasks to job [{job_id}]...')
+
+    tasks = []
+
+    for idx, input_file in enumerate(resource_input_files):
+
+        command = f"/bin/bash -c \"cat {input_file.file_path}\""
+        tasks.append(batchmodels.TaskAddParameter(
+            id=f'Task{idx}',
+            command_line=command,
+            resource_files=[input_file]
+        )
+        )
+
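+    # Note: add_collection accepts at most 100 tasks per call; a larger input set would need to be chunked.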
+    batch_service_client.task.add_collection(job_id, tasks)
+
+
+def wait_for_tasks_to_complete(batch_service_client: BatchServiceClient, job_id: str,
+                               timeout: datetime.timedelta):
+    """
+    Returns when all tasks in the specified job reach the Completed state.
+
+    :param batch_service_client: A Batch service client.
+    :param job_id: The id of the job whose tasks should be monitored.
+    :param timeout: The duration to wait for task completion. If all
+    tasks in the specified job do not reach Completed state within this time
+    period, an exception will be raised.
+    """
+    timeout_expiration = datetime.datetime.now() + timeout
+
+    print(f"Monitoring all tasks for 'Completed' state, timeout in {timeout}...", end='')
+
+    while datetime.datetime.now() < timeout_expiration:
+        print('.', end='')
+        sys.stdout.flush()
+        tasks = batch_service_client.task.list(job_id)
+
+        incomplete_tasks = [task for task in tasks if
+                            task.state != batchmodels.TaskState.completed]
+        if not incomplete_tasks:
+            print()
+            return True
+
+        time.sleep(1)
+
+    print()
+    raise RuntimeError("ERROR: Tasks did not reach 'Completed' state within "
+                       "timeout period of " + str(timeout))
+
+
+def print_task_output(batch_service_client: BatchServiceClient, job_id: str,
+                      text_encoding: str = None):
+    """
+    Prints the stdout.txt file for each task in the job.
+
+    :param batch_service_client: The Batch service client to use.
+    :param str job_id: The id of the job with task output files to print.
+    :param str text_encoding: The encoding used to decode the task output. Defaults to utf-8.
+    """
+
+    print('Printing task output...')
+
+    tasks = batch_service_client.task.list(job_id)
+
+    for task in tasks:
+
+        node_id = batch_service_client.task.get(
+            job_id, task.id).node_info.node_id
+        print(f"Task: {task.id}")
+        print(f"Node: {node_id}")
+
+        stream = batch_service_client.file.get_from_task(
+            job_id, task.id, config.STANDARD_OUT_FILE_NAME)
+
+        file_text = _read_stream_as_string(
+            stream,
+            text_encoding)
+
+        if text_encoding is None:
+            text_encoding = DEFAULT_ENCODING
+
+        sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding = text_encoding)
+        sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding = text_encoding)
+
+        print("Standard output:")
+        print(file_text)
+
+
+def _read_stream_as_string(stream, encoding) -> str:
+    """
+    Read stream as string
+
+    :param stream: input stream generator
+    :param str encoding: The encoding of the file. The default is utf-8.
+    :return: The file content.
+    """
+    output = io.BytesIO()
+    try:
+        for data in stream:
+            output.write(data)
+        if encoding is None:
+            encoding = DEFAULT_ENCODING
+        return output.getvalue().decode(encoding)
+    finally:
+        output.close()
+
+
+if __name__ == '__main__':
+
+    start_time = datetime.datetime.now().replace(microsecond=0)
+    print(f'Sample start: {start_time}')
+    print()
+
+    # Create the blob client, for use in obtaining references to
+    # blob storage containers and uploading files to containers.
+    blob_service_client = BlobServiceClient(
+        account_url=f"https://{config.STORAGE_ACCOUNT_NAME}.{config.STORAGE_ACCOUNT_DOMAIN}/",
+        credential=config.STORAGE_ACCOUNT_KEY
+    )
+
+    # Use the blob client to create the containers in Azure Storage if they
+    # don't yet exist.
+    input_container_name = 'input'      # pylint: disable=invalid-name
+    try:
+        blob_service_client.create_container(input_container_name)
+    except ResourceExistsError:
+        pass
+
+    # The collection of data files that are to be processed by the tasks.
+    input_file_paths = [os.path.join(sys.path[0], 'taskdata0.txt'),
+                        os.path.join(sys.path[0], 'taskdata1.txt'),
+                        os.path.join(sys.path[0], 'taskdata2.txt')]
+
+    # Upload the data files.
+    input_files = [
+        upload_file_to_container(blob_service_client, input_container_name, file_path)
+        for file_path in input_file_paths]
+
+    # Create a Batch service client. We'll now be interacting with the Batch
+    # service in addition to Storage
+    credentials = SharedKeyCredentials(config.BATCH_ACCOUNT_NAME,
+        config.BATCH_ACCOUNT_KEY)
+
+    batch_client = BatchServiceClient(
+        credentials,
+        batch_url=config.BATCH_ACCOUNT_URL)
+
+    try:
+        # Create the pool that will contain the compute nodes that will execute the
+        # tasks.
+        create_pool(batch_client, config.POOL_ID)
+
+        # Create the job that will run the tasks.
+        create_job(batch_client, config.JOB_ID, config.POOL_ID)
+
+        # Add the tasks to the job.
+        add_tasks(batch_client, config.JOB_ID, input_files)
+
+        # Pause execution until tasks reach Completed state.
+        wait_for_tasks_to_complete(batch_client,
+                                   config.JOB_ID,
+                                   datetime.timedelta(minutes=30))
+
+        print("  Success! All tasks reached the 'Completed' state within the "
+              "specified timeout period.")
+
+        # Print the stdout.txt and stderr.txt files for each task to the console
+        print_task_output(batch_client, config.JOB_ID)
+
+        # Print out some timing info
+        end_time = datetime.datetime.now().replace(microsecond=0)
+        print()
+        print(f'Sample end: {end_time}')
+        elapsed_time = end_time - start_time
+        print(f'Elapsed time: {elapsed_time}')
+        print()
+        input('Press ENTER to exit...')
+
+    except batchmodels.BatchErrorException as err:
+        print_batch_exception(err)
+        raise
+
+    finally:
+        # Clean up storage resources
+        print(f'Deleting container [{input_container_name}]...')
+        blob_service_client.delete_container(input_container_name)
+
+        # Clean up Batch resources (if the user so chooses).
+        if query_yes_no('Delete job?') == 'yes':
+            batch_client.job.delete(config.JOB_ID)
+
+        if query_yes_no('Delete pool?') == 'yes':
+            batch_client.pool.delete(config.POOL_ID)
+ 

+ 218 - 0
Azure/AddUp/table_advanced_samples.py

@@ -0,0 +1,218 @@
+#-------------------------------------------------------------------------
+# Microsoft Developer & Platform Evangelism
+#
+# Copyright (c) Microsoft Corporation. All rights reserved.
+#
+# THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, 
+# EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES 
+# OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
+#----------------------------------------------------------------------------------
+# The example companies, organizations, products, domain names,
+# e-mail addresses, logos, people, places, and events depicted
+# herein are fictitious. No association with any real company,
+# organization, product, domain name, email address, logo, person,
+# places, or events is intended or should be inferred.
+#--------------------------------------------------------------------------
+import config
+import datetime
+import time
+from random_data import RandomData
+from tablestorageaccount import TableStorageAccount
+from azure.storage import CloudStorageAccount, AccessPolicy
+from azure.storage.table import TableService, Entity, TablePermissions
+from azure.storage.models import CorsRule, Logging, Metrics, RetentionPolicy, ResourceTypes, AccountPermissions
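+# Note: this sample targets the legacy azure-storage SDK (azure.storage.table / CloudStorageAccount);
+# the newer azure-data-tables package exposes a different API.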
+
+#
+# Azure Table Service Sample - Demonstrate how to perform common tasks using the Microsoft Azure Table Service
+# including creating a table, CRUD operations and different querying techniques.
+#
+# Documentation References:
+#  - What is a Storage Account - http://azure.microsoft.com/en-us/documentation/articles/storage-whatis-account/
+#  - Getting Started with Tables - https://azure.microsoft.com/en-us/documentation/articles/storage-python-how-to-use-table-storage/
+#  - Table Service Concepts - http://msdn.microsoft.com/en-us/library/dd179463.aspx
+#  - Table Service REST API - http://msdn.microsoft.com/en-us/library/dd179423.aspx
+#  - Table Service Python API - http://azure.github.io/azure-storage-python/ref/azure.storage.table.html
+#  - Storage Emulator - http://azure.microsoft.com/en-us/documentation/articles/storage-use-emulator/
+#
+class TableAdvancedSamples():
+
+    def __init__(self):
+        self.random_data = RandomData()
+
+    # Runs all samples for Azure Storage Table service.
+    def run_all_samples(self, account):
+        table_service = account.create_table_service()
+        print('Azure Storage Advanced Table samples - Starting.')
+        
+        print('\n\n* List tables *\n')
+        self.list_tables(table_service)
+        
+        if not account.is_azure_cosmosdb_table():
+           print('\n\n* Set service properties *\n')
+           self.set_service_properties(table_service)
+        
+           print('\n\n* Set Cors rules *\n')
+           self.set_cors_rules(table_service)
+        
+           print('\n\n* ACL operations *\n')
+           self.table_acl_operations(table_service)
+        
+        if (config.IS_EMULATED):
+            print('\n\n* Shared Access Signature is not supported in emulator *\n')
+        else:
+            print('\n\n* SAS operations *\n')
+            self.table_operations_with_sas(account)
+
+        print('\nAzure Storage Advanced Table samples - Completed.\n')
+
+    # Manage tables including creating, listing and deleting
+    def list_tables(self, table_service):
+        table_prefix = 'table' + self.random_data.get_random_name(6)
+
+        try:        
+            # Create tables
+            for i in range(5):
+                table_name = table_prefix + str(i)
+                print('1. Create a table with name - ' + table_name)
+                table_service.create_table(table_name)
+            
+            # List all the tables 
+            print('2. List tables')
+            tables = table_service.list_tables()
+            for table in tables:
+                print('\tTable Name: ' + table.name)
+
+        finally:
+            # Delete the tables
+            print("3. Delete Tables")
+            for i in range(5):
+                table_name = table_prefix + str(i)
+                if(table_service.exists(table_name)):
+                    table_service.delete_table(table_name)
+            
+        print("List tables sample completed")
+    
+    # Manage properties of the Table service, including logging and metrics settings, and the default service version.
+    def set_service_properties(self, table_service):
+        print('1. Get Table service properties')
+        props = table_service.get_table_service_properties()
+
+        retention = RetentionPolicy(enabled=True, days=5)
+        logging = Logging(delete=True, read=False, write=True, retention_policy=retention)
+        hour_metrics = Metrics(enabled=True, include_apis=True, retention_policy=retention)
+        minute_metrics = Metrics(enabled=False)
+
+        try:
+            print('2. Overwrite Table service properties')
+            table_service.set_table_service_properties(logging=logging, hour_metrics=hour_metrics, minute_metrics=minute_metrics)
+
+        finally:
+            print('3. Revert Table service properties back to the original ones')
+            table_service.set_table_service_properties(logging=props.logging, hour_metrics=props.hour_metrics, minute_metrics=props.minute_metrics)
+
+        print('4. Set Table service properties completed')
+    
+    # Manage CORS rules on the table service
+    def set_cors_rules(self, table_service):
+        cors_rule = CorsRule(
+            allowed_origins=['*'], 
+            allowed_methods=['POST', 'GET'],
+            allowed_headers=['*'],
+            exposed_headers=['*'],
+            max_age_in_seconds=3600)
+        
+        print('1. Get Cors Rules')
+        original_cors_rules = table_service.get_table_service_properties().cors
+
+        try:        
+            print('2. Overwrite Cors Rules')
+            table_service.set_table_service_properties(cors=[cors_rule])
+
+        finally:
+            #reverting cors rules back to the original ones
+            print('3. Revert Cors Rules back to the original ones')
+            table_service.set_table_service_properties(cors=original_cors_rules)
+        
+        print("CORS sample completed")
+
+    # Manage table access policy
+    def table_acl_operations(self, table_service):
+        table_name = 'acltable' + self.random_data.get_random_name(6)
+
+        try:        
+            print('1. Create a table with name - ' + table_name)
+            table_service.create_table(table_name)
+                
+            print('2. Set access policy for table')
+            access_policy = AccessPolicy(permission=TablePermissions.QUERY,
+                                        expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=1))
+            identifiers = {'id': access_policy}
+            table_service.set_table_acl(table_name, identifiers)
+
+            print('3. Wait 30 seconds for acl to propagate')
+            time.sleep(30)
+
+            print('4. Get access policy from table')
+            acl = table_service.get_table_acl(table_name)
+
+            print('5. Clear access policy in table')
+            table_service.set_table_acl(table_name)
+
+        finally:
+            print('6. Delete table')
+            if(table_service.exists(table_name)):
+                table_service.delete_table(table_name)
+            
+        print("Table ACL operations sample completed")
+    
+    # Manage shared access signature on a table
+    def table_operations_with_sas(self, account):
+        table_name = 'sastable' + self.random_data.get_random_name(6)
+        
+        try:
+            # Create a Table Service object
+            table_service = account.create_table_service()
+            
+            print('1. Create table with name - ' + table_name)
+            table_service.create_table(table_name)
+            
+            # Create a Shared Access Signature for the table
+            print('2. Get sas for table')
+            
+            table_sas = table_service.generate_table_shared_access_signature(
+                table_name, 
+                TablePermissions.QUERY + TablePermissions.ADD + TablePermissions.UPDATE + TablePermissions.DELETE, 
+                datetime.datetime.utcnow() + datetime.timedelta(hours=1))
+
+            shared_account = TableStorageAccount(account_name=account.account_name, sas_token=table_sas, endpoint_suffix=account.endpoint_suffix)
+            shared_table_service = shared_account.create_table_service()
+
+            # Create a sample entity to insert into the table
+            customer = {'PartitionKey': 'Harp', 'RowKey': '1', 'email' : 'harp@contoso.com', 'phone' : '555-555-5555'}
+
+            # Insert the entity into the table
+            print('3. Insert new entity into table with sas - ' + table_name)
+            shared_table_service.insert_entity(table_name, customer)
+            
+            # Demonstrate how to query the entity
+            print('4. Read the inserted entity with sas.')
+            entity = shared_table_service.get_entity(table_name, 'Harp', '1')
+            
+            print(entity['email'])
+            print(entity['phone'])
+
+            # Demonstrate how to update the entity by changing the phone number
+            print('5. Update an existing entity by changing the phone number with sas')
+            customer = {'PartitionKey': 'Harp', 'RowKey': '1', 'email' : 'harp@contoso.com', 'phone' : '425-123-1234'}
+            shared_table_service.update_entity(table_name, customer)
+
+            # Demonstrate how to delete an entity
+            print('6. Delete the entity with sas')
+            shared_table_service.delete_entity(table_name, 'Harp', '1')
+
+        finally:
+            print('7. Delete table')
+            if(table_service.exists(table_name)):
+                table_service.delete_table(table_name)
+            
+        print("Table operations with sas completed")

+ 96 - 0
Azure/AddUp/table_basic_samples.py

@@ -0,0 +1,96 @@
+#-------------------------------------------------------------------------
+# Microsoft Developer & Platform Evangelism
+#
+# Copyright (c) Microsoft Corporation. All rights reserved.
+#
+# THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, 
+# EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES 
+# OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
+#----------------------------------------------------------------------------------
+# The example companies, organizations, products, domain names,
+# e-mail addresses, logos, people, places, and events depicted
+# herein are fictitious. No association with any real company,
+# organization, product, domain name, email address, logo, person,
+# places, or events is intended or should be inferred.
+#--------------------------------------------------------------------------
+import config
+from random_data import RandomData
+from azure.storage import CloudStorageAccount
+from azure.storage.table import TableService, Entity
+
+#
+# Azure Table Service Sample - Demonstrate how to perform common tasks using the Microsoft Azure Table Service
+# including creating a table, CRUD operations and different querying techniques.
+#
+# Documentation References:
+#  - What is a Storage Account - http://azure.microsoft.com/en-us/documentation/articles/storage-whatis-account/
+#  - Getting Started with Tables - https://azure.microsoft.com/en-us/documentation/articles/storage-python-how-to-use-table-storage/
+#  - Table Service Concepts - http://msdn.microsoft.com/en-us/library/dd179463.aspx
+#  - Table Service REST API - http://msdn.microsoft.com/en-us/library/dd179423.aspx
+#  - Table Service Python API - http://azure.github.io/azure-storage-python/ref/azure.storage.table.html
+#  - Storage Emulator - http://azure.microsoft.com/en-us/documentation/articles/storage-use-emulator/
+#
+class TableBasicSamples():
+
+    def __init__(self):
+        self.random_data = RandomData()
+
+    # Runs all samples for Azure Storage Table service.
+    def run_all_samples(self, account):
+        print('Azure Storage Basic Table samples - Starting.')
+        table_name = 'tablebasics' + self.random_data.get_random_name(6)
+        table_service = None
+        try:
+            table_service = account.create_table_service()
+
+            # Create a new table
+            print('Create a table with name - ' + table_name)
+
+            try:
+                table_service.create_table(table_name)
+            except Exception as err:
+                print('Error creating table ' + table_name + ', check if it already exists')
+ 
+            # Create a sample entity to insert into the table
+            customer = {'PartitionKey': 'Harp', 'RowKey': '1', 'email' : 'harp@contoso.com', 'phone' : '555-555-5555'}
+
+            # Insert the entity into the table
+            print('Inserting a new entity into table - ' + table_name)
+            table_service.insert_entity(table_name, customer)
+            print('Successfully inserted the new entity')
+
+            # Demonstrate how to query the entity
+            print('Read the inserted entity.')
+            entity = table_service.get_entity(table_name, 'Harp', '1')
+            print(entity['email'])
+            print(entity['phone'])
+
+            # Demonstrate how to update the entity by changing the phone number
+            print('Update an existing entity by changing the phone number')
+            customer = {'PartitionKey': 'Harp', 'RowKey': '1', 'email' : 'harp@contoso.com', 'phone' : '425-123-1234'}
+            table_service.update_entity(table_name, customer)
+
+            # Demonstrate how to query the updated entity, filter the results with a filter query and select only the value in the phone column
+            print('Read the updated entity with a filter query')
+            entities = table_service.query_entities(table_name, filter="PartitionKey eq 'Harp'", select='phone')
+            for entity in entities:
+                print(entity['phone'])
+
+            # Demonstrate how to delete an entity
+            print('Delete the entity')
+            table_service.delete_entity(table_name, 'Harp', '1')
+            print('Successfully deleted the entity')
+
+        except Exception as e:
+            if (config.IS_EMULATED):
+                print('Error occurred in the sample. If you are using the emulator, please make sure the emulator is running.', e)
+            else: 
+                print('Error occurred in the sample. Please make sure the account name and key are correct.', e)
+        finally:
+            # Demonstrate deleting the table. If you don't want the table deleted, comment out the block of code below.
+            print('Deleting the table.')
+            if(table_service.exists(table_name)):
+                table_service.delete_table(table_name)
+            print('Successfully deleted the table')
+
+        print('\nAzure Storage Basic Table samples - Completed.\n')

+ 1 - 0
Azure/AzureStorage

@@ -0,0 +1 @@
+Subproject commit ac4dbd83e307a5b8d3fd3b77103ec837b821c564

+ 125 - 0
Azure/DLfile.py

@@ -0,0 +1,125 @@
+from azure.datalake.store import core, lib
+import config
+
+import sys, io
+import schedule, threading, time
+
+from datetime import datetime
+
+from os import listdir
+from os.path import isfile, join
+
+
+import glob
+
+
+def run_once_threaded(job_func):
+    job_thread = threading.Thread(target=job_func)
+    job_thread.start()
+    return schedule.CancelJob
+
+def run_threaded(job_func):
+    job_thread = threading.Thread(target=job_func)
+    job_thread.start()
+    
+
+local_upload_folder_path = "LOCAL_FOLDER_PATH"
+adls_upload_folder_path = "ADLS_FOLDER_PATH"
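+# Replace the two placeholder paths above with the real local source folder and the target ADLS folder before running.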
+
+
+original_stdout = sys.stdout
+
+buf = io.StringIO()
+sys.stdout = buf
+adlCreds = -1
+
+uploaded_files = False
+
+def postToTeams():
+ output = buf.getvalue()
+ if output == "":
+  return
+ original_stdout.write(output)
+
+  
+ now = datetime.now()
+ current_time = now.strftime("%H:%M:%S")
+ 
+ config.sendToTeams("{}<br>{}".format(current_time, output))
+ 
+ buf.truncate(0)
+ buf.seek(0)
+ 
+def authenticate():
+ global adlCreds
+ adlCreds = lib.auth(config.azure_tenant_id)
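+ # lib.auth obtains Azure Data Lake Store credentials for the tenant configured in config.py;
+ # the result is stored in the module-level adlCreds so authenticated() can detect when sign-in has completed.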
+
+
+def authenticated():
+ if adlCreds ==  -1:
+  return
+  
+#  print("Authentication sucess!")
+  
+ run_once_threaded(upload_files)
+ 
+ return schedule.CancelJob
+
+ 
+def upload_files():
+ adl = core.AzureDLFileSystem(adlCreds, store_name=config.store_name)
+ uploadedFolders = adl.ls(adls_upload_folder_path)
+ 
+ uploadedFolders = set([folder.replace(adls_upload_folder_path[1:], "")+"/" for folder in uploadedFolders])
+ 
+ local_folders = glob.glob(local_upload_folder_path+"*")  # '*' matches everything; use a pattern such as '*.csv' to restrict the format
+ local_folders = set([d.replace(local_upload_folder_path, "")+"/" for d in local_folders])
+
+ to_upload_folders = local_folders.difference(uploadedFolders)
+
+ folder_names = sorted([d.replace(local_upload_folder_path, "") for d in to_upload_folders])
+
+ files = []
+ for folder in folder_names:
+  path = local_upload_folder_path+folder
+  for f in listdir(path):
+   if isfile(join(path, f)):
+    files.append(folder+f)
+
+
+ print("Uploading the following folders:<br>{}<br>Total number of files to upload:<br>{}".format(", ". join(folder_names), len(files)))
+ 
+
+ for f in files:
+  adl.put(local_upload_folder_path+f, adls_upload_folder_path+f)
+    
+
+ print("Upload finished.")
+ time.sleep(2)
+ global uploaded_files
+ uploaded_files = True
+
+
+def exit_program():
+ if uploaded_files == True:
+  exit()
+
+schedule.every(2).seconds.do(run_threaded, postToTeams)
+schedule.every().seconds.do(run_once_threaded, authenticate)
+schedule.every().seconds.do(authenticated)
+schedule.every().seconds.do(exit_program)
+
+
+while 1:
+    schedule.run_pending()
+    time.sleep(1) 

+ 1 - 0
Azure/azure-multiapi-storage-python

@@ -0,0 +1 @@
+Subproject commit dc0e7dc1066ca4cd2d6006a5bccd7ec37521ec1c

+ 64 - 0
Azure/blob-adapter.py

@@ -0,0 +1,64 @@
+import configparser
+from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__
+from azure.core.exceptions import HttpResponseError, ResourceExistsError
+from flask import jsonify
+
+class AzureBlobAdapter:
+    FILE_PREFIX = 'IN_CARE'
+    blob_service_client: BlobServiceClient
+    blob_client: BlobClient
+    container_client: ContainerClient
+    configs = configparser.ConfigParser()
+    configs.read('azure_blob.cfg')
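+    # azure_blob.cfg is assumed to look roughly like:
+    #   [azure_blob_config]
+    #   connection_string = DefaultEndpointsProtocol=...
+    #   container_name = <your-container>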
+
+    # init method or constructor
+
+    def __init__(self):
+        connection_string = self.get_config('connection_string')
+        print("Azure Blob Storage v" + __version__ +
+              " - Blob Python libs")
+        self.blob_service_client = BlobServiceClient.from_connection_string(
+            connection_string)
+
+    def upload(self, file_dict):
+        upload_response = {}
+        for key in file_dict:
+            print("File Dict Key: [{}] value is: {}".format(key, file_dict[key]))
+            print("\nUploading to Azure Storage as blob:\n\t" + key)
+
+            self.blob_client = self.blob_service_client.get_blob_client(container=self.get_config('container_name'), blob=key)
+            with open(file_dict[key], "rb") as data:
+                try:
+                    self.blob_client.upload_blob(data)
+                    print('File: Uploaded Successfully: {}'.format(key))
+                    upload_response[key] = 'Successfully Uploaded'
+                except ResourceExistsError:
+                    print('File: NOT Uploaded Successfully: {}'.format(key))
+                    upload_response[key] = 'This Resource already exists'
+                    upload_response['Partial'] = True
+                    print('This Resource already exists')
+                    # return 'This Resource already exists'
+        print("Before Returning Response:")
+        print(jsonify(upload_response))
+        print("---------------")
+        return upload_response
+
+    def get_blob_client(self, blob_name):
+        self.blob_client = self.blob_service_client.get_blob_client(
+            container=self.get_config('container_name'), blob=blob_name)
+        return self.blob_client
+
+    def list_blobs(self):
+        print("\nList blobs in the container")
+        self.container_client = self.blob_service_client.get_container_client(
+            container=self.get_config('container_name'))
+        blob_list = self.container_client.list_blobs()
+        blobs = []
+        for blob in blob_list:
+            # print("\t Blob name: " + blob.name)
+            blobs.append(blob.name)
+        return blobs
+
+    def get_config(self, app_property):
+        config_value = self.configs['azure_blob_config'][app_property]
+        return config_value

+ 98 - 0
Azure/blob-permission.py

@@ -0,0 +1,98 @@
+from datetime import datetime, timedelta
+
+from azure.storage.blob import BlobSasPermissions, generate_blob_sas
+
+from azurebatchload.checks import Checks
+
+
+class Base(Checks):
+    def __init__(
+        self,
+        destination,
+        folder,
+        extension=None,
+        modified_since=None,
+        method="batch",
+        list_files=None,
+        expiry_download_links=7,
+    ):
+        super().__init__(directory=folder)
+
+        self.destination = destination
+        self.folder = folder
+        self.extension = extension
+        self.modified_since = modified_since
+        if not self._check_azure_cli_installed():
+            self.method = "single"
+        else:
+            self.method = method
+        self.list_files = list_files
+        credentials = self._check_connection_credentials()
+        self.connection_string = credentials[0]
+        self.account_name = credentials[1]
+        self.account_key = credentials[2]
+        self.expiry_download_links = expiry_download_links
+
+    def checks(self):
+        allowed_methods = ("batch", "single")
+        if self.method not in allowed_methods:
+            raise ValueError(f"Method {self.method} is not a valid method. Choose from {' or '.join(allowed_methods)}.")
+
+        if self.list_files and self.method == "batch":
+            raise ValueError("list_files is only allowed with method='single'.")
+
+        if self.list_files and not isinstance(self.list_files, list):
+            raise ValueError(f"Argument list_files was set, but is not of type list, but type {type(self.list_files)}")
+
+    def create_blob_link(self, blob_folder, blob_name) -> str:
+        if blob_folder:
+            full_path_blob = f"{blob_folder}/{blob_name}"
+        else:
+            full_path_blob = blob_name
+        url = f"https://{self.account_name}.blob.core.windows.net/{self.destination}/{full_path_blob}"
+        sas_token = generate_blob_sas(
+            account_name=self.account_name,
+            account_key=self.account_key,
+            container_name=self.destination,
+            blob_name=full_path_blob,
+            permission=BlobSasPermissions(read=True, delete_previous_version=False),
+            expiry=datetime.utcnow() + timedelta(days=self.expiry_download_links),
+        )
+
+        url_with_sas = f"{url}?{sas_token}"
+        return url_with_sas
+
+    @staticmethod
+    def create_not_case_sensitive_extension(extension):
+        """
+        We create in-case sensitive fnmatch
+        .pdf -> .[Pp][Dd][Ff]
+        .csv -> .[Cc][Ss][Vv]
+        """
+        new_extension = ""
+        for letter in extension:
+            if not letter.isalpha():
+                new_extension += letter
+            else:
+                new_extension += f"[{letter.upper()}{letter}]"
+
+        if not new_extension.startswith("*"):
+            new_extension = "*" + new_extension
+
+        return new_extension
+
+    def define_pattern(self):
+        self.extension = self.create_not_case_sensitive_extension(self.extension)
+        if self.folder and not self.extension:
+            if self.folder.endswith("/"):
+                pattern = self.folder + "*"
+            else:
+                pattern = self.folder + "/*"
+        elif self.folder and self.extension:
+            pattern = self.folder.rstrip("/") + "/" + "*" + self.extension
+        elif not self.folder and self.extension:
+            pattern = "*" + self.extension
+        else:
+            pattern = None
+
+        return pattern

+ 101 - 0
Azure/blob-upload-1.py

@@ -0,0 +1,101 @@
+import logging
+import os
+
+from azure.storage.blob import BlobServiceClient
+
+from azurebatchload.core import Base
+
+
+class Upload(Base):
+    def __init__(
+        self,
+        destination,
+        source,
+        folder=None,
+        extension=None,
+        method="batch",
+        modified_since=None,
+        overwrite=False,
+        list_files=None,
+        create_download_links=False,
+        expiry_download_links=7,
+    ):
+        super(Upload, self).__init__(
+            destination=destination,
+            folder=source,
+            extension=extension,
+            modified_since=modified_since,
+            method=method,
+            list_files=list_files,
+            expiry_download_links=expiry_download_links,
+        )
+        self.blob_folder = folder
+        self.overwrite = overwrite
+        self.create_download_links = create_download_links
+
+    def upload_batch(self):
+        cmd = f"az storage fs directory upload " f"-f {self.destination} " f"-s {self.folder} -r"
+
+        non_default = {"-d": self.blob_folder, "--connection-string": self.connection_string}
+
+        for flag, value in non_default.items():
+            if value:
+                cmd = f"{cmd} {flag} '{value}'"
+
+        os.system(cmd)
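+        # Note: upload_batch shells out to the Azure CLI ("az storage fs directory upload");
+        # the Base constructor falls back to method='single' when the CLI is not installed.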
+
+    def upload_single(self):
+        blob_service_client = BlobServiceClient.from_connection_string(self.connection_string)
+        download_links = {}
+
+        for root, dirs, files in os.walk(self.folder):
+            for file in files:
+
+                full_path = os.path.join(root, file)
+
+                # ignore hidden files
+                if file.startswith("."):
+                    continue
+
+                # if list_files is given, only upload matched files
+                if self.list_files and file not in self.list_files:
+                    continue
+
+                # if extension is given only upload if extension is matched
+                if self.extension and os.path.isfile(full_path) and not file.lower().endswith(self.extension.lower()):
+                    continue
+
+                blob_folder = root.replace(self.folder, "").lstrip("/")
+
+                if self.blob_folder:
+                    # we only want to append blob_folder if it actually is a path or folder
+                    # blob_folder can be empty string ""
+                    if blob_folder:
+                        blob_folder = os.path.join(self.blob_folder, blob_folder)
+                    else:
+                        blob_folder = self.blob_folder
+
+                # if no folder is given, just upload to the container root path
+                if not blob_folder:
+                    container = self.destination
+                else:
+                    container = os.path.join(self.destination, blob_folder)
+                container_client = blob_service_client.get_container_client(container=container)
+
+                with open(full_path, "rb") as data:
+                    logging.debug(f"Uploading blob {full_path}")
+                    container_client.upload_blob(data=data, name=file, overwrite=self.overwrite)
+
+                if self.create_download_links:
+                    download_links[file] = self.create_blob_link(blob_folder=blob_folder, blob_name=file)
+
+        return download_links
+
+    def upload(self):
+        self.checks()
+
+        logging.info(f"Uploading to container {self.destination} with method = '{self.method}'.")
+        if self.method == "batch":
+            return self.upload_batch()
+        else:
+            return self.upload_single()

+ 81 - 0
Azure/blob-upload-2.py

@@ -0,0 +1,81 @@
+import requests
+from bs4 import BeautifulSoup as bs
+import os
+from azure.storage.blob import BlobServiceClient, BlobClient
+from azure.storage.blob import ContentSettings, ContainerClient
+
+#Your Connexion String
+MY_CONNECTION_STRING = "DefaultEndpointsProtocol************************"
+#Your Container Name
+MY_IMAGE_CONTAINER = "picture"
+#Your local path
+LOCAL_IMAGE_PATH = r"..\Picture"
+#change the url to the one you want to scrape
+URL = 'WebSiteURL'
+
+class AzureBlobStorage:
+    def Scrapp(self):
+        #create folder with the picture if it doesn't exist
+        if not os.path.exists(r'.\Picture'):
+            os.mkdir(r'.\Picture')
+        os.chdir(r'.\Picture')
+        #Change the number to begin where you want to start
+        page_begin = 1
+        #Change the number to the number of pages you want to scrape
+        page_end = 230 + 1
+
+        #If you want to scrape only one page, change the page_end to page_begin or delete the loop
+        for page in range(page_begin, page_end):
+            req = requests.get(URL + str(page))
+            soup = bs(req.text, 'html.parser')
+            images = soup.find_all('img')
+            for image in images:
+                name = image['src']
+                alpha = image['src']
+                link = 'WebSiteURL' + alpha
+                print(link)
+                #replace the name of the photo it's better :))
+                with open(name.replace(' ', '-').replace('/', '').replace('"', "'").replace('.jpg','') + '.jpg','wb') as f:
+                    im = requests.get(link)
+                    f.write(im.content)
+                    #check the name on the terminal
+                    print('Writing: ', name)
+
+    def __init__(self):
+        # Initialize the connection to Azure storage account
+        self.blob_service_client = BlobServiceClient.from_connection_string(MY_CONNECTION_STRING)
+
+    def upload_all_images_in_folder(self):
+        # Get all files with jpg extension and exclude directories
+        all_file_names = [f for f in os.listdir(LOCAL_IMAGE_PATH)
+                          if os.path.isfile(os.path.join(LOCAL_IMAGE_PATH, f)) and ".jpg" in f]
+        # Upload each file
+        for file_name in all_file_names:
+            self.upload_image(file_name)
+
+    def upload_image(self, file_name):
+        # Create blob with same name as local file name
+        blob_client = self.blob_service_client.get_blob_client(container=MY_IMAGE_CONTAINER,
+                                                               blob=file_name)
+        # Get full path to the file
+        upload_file_path = os.path.join(LOCAL_IMAGE_PATH, file_name)
+        # Create blob on storage
+        # Overwrite if it already exists!
+        image_content_setting = ContentSettings(content_type='image/jpeg')
+        print(f"uploading file - {file_name}")
+        with open(upload_file_path, "rb") as data:
+            blob_client.upload_blob(data, overwrite=True, content_settings=image_content_setting)
+
+if __name__=='__main__':
+        
+    # Initialize class and upload files
+    azure_blob_file_uploader = AzureBlobStorage()
+    azure_blob_file_uploader.Scrapp()
+    azure_blob_file_uploader.upload_all_images_in_folder()

+ 57 - 0
Azure/blob-upload-3.py

@@ -0,0 +1,57 @@
+from flask import Flask
+from flask import jsonify
+from flask import request
+from werkzeug.utils import secure_filename
+from azure.storage.blob import BlockBlobService
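+# Note: BlockBlobService comes from the legacy azure-storage-blob 2.x SDK;
+# azure-storage-blob 12+ replaces it with BlobServiceClient.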
+import os
+
+
+app = Flask(__name__, static_folder='static', static_url_path='')
+
+app.config['ALLOWED_EXTENSIONS'] = set(['txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'])
+app.config['MAX_CONTENT_LENGTH'] = 1 * 1024 * 1024    # 1 Mb limit
+app.config['AZURE_STORAGE_ACCOUNT'] = "flasktest"
+app.config['AZURE_STORAGE_CONTAINER'] = "doc"
+app.config['AZURE_STORAGE_KEY'] = os.environ['AZURE_STORAGE_KEY']
+try:
+    os.environ['FLASK_DEBUG']
+    app.debug = True
+except KeyError:
+    app.debug = False
+
+
+def allowed_file(filename):
+    return '.' in filename and \
+           filename.rsplit('.', 1)[1] in app.config['ALLOWED_EXTENSIONS']
+
+@app.route('/')
+def root():
+    return app.send_static_file('index.html')
+
+
+# basedir = os.path.abspath(os.path.dirname(__file__))
+
+@app.route('/uploadajax', methods=['POST'])
+def upldfile():
+    if request.method == 'POST':
+        file = request.files['file']
+        if file and allowed_file(file.filename):
+            filename = secure_filename(file.filename)
+            app.logger.info('FileName: ' + filename)
+            
+            block_blob_service = BlockBlobService(account_name=app.config['AZURE_STORAGE_ACCOUNT'], account_key=app.config['AZURE_STORAGE_KEY'])
+            block_blob_service.create_blob_from_bytes(
+                'doc',
+                filename,
+                file.read())
+            
+#             updir = os.path.join(basedir, 'upload/')
+#             file.save(os.path.join(updir, filename))
+#             file_size = os.path.getsize(os.path.join(updir, filename))
+            return jsonify(name=filename, url='https://'+app.config['AZURE_STORAGE_ACCOUNT']+'.blob.core.windows.net/' \
+                           +app.config['AZURE_STORAGE_CONTAINER']+'/'+filename)
+
+
+
+if __name__ == '__main__':
+ app.run()

+ 67 - 0
Azure/blob-upload-4.py

@@ -0,0 +1,67 @@
+import os, uuid
+from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__
+import argparse
+from argparse import ArgumentParser
+
+parser = ArgumentParser()
+parser.add_argument('--connect_str', default='', type=str)
+parser.add_argument('--container_name', default='', type=str)
+parser.add_argument('--source', default='', type=str)
+parser.add_argument('--target', default='', type=str)
+parser.add_argument('--is_directory', default=False, action='store_true')
+parser.add_argument('--download', default=False, action='store_true')
+parser.add_argument('--upload', default=False, action='store_true')
+arg = parser.parse_args()
+
+connect_str = arg.connect_str #Enter your connection string here! Refer to https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python?tabs=environment-variable-windows for more info
+container_name = arg.container_name  # Enter your container name from Azure Blob Storage here!
+blob_service_client = BlobServiceClient.from_connection_string(connect_str) # Create the BlobServiceClient object which will be used to create a container client
+
+def upload_file_to_blob(upload_file_path, target):  # local file path -> blob path
+    blob_client = blob_service_client.get_blob_client(container=container_name, blob=target)
+    print("\nUploading to Azure Storage as blob:\n\t" + upload_file_path)
+    with open(upload_file_path, "rb") as data:
+        blob_client.upload_blob(data)
+
+def upload_directory_to_blob(upload_file_path, target): #directory name -> directory name
+    print("\nUploading directory to Azure Storage as blob:\n\t" + upload_file_path)
+    files = os.listdir(upload_file_path)
+    for entry in files:
+        file_name = upload_file_path + '/' + entry
+        target_ = target + '/' + entry
+        blob_client = blob_service_client.get_blob_client(container=container_name, blob=target_)
+        with open(file_name, "rb") as data:
+            blob_client.upload_blob(data)
+
+def download_file_from_blob(source, download_file_path):
+    blob_client = blob_service_client.get_blob_client(container=container_name, blob=source)
+    print("\nDownloading blob to \n\t from container" + download_file_path)
+
+    with open(download_file_path, "wb") as download_file:
+        download_file.write(blob_client.download_blob().readall())
+
+def download_directory_from_blob(source, download_directory_path):
+    container_client = ContainerClient.from_connection_string(conn_str=connect_str, container_name=container_name)
+    print(f"\nDownloading all blobs from the following directory {source} in container {container_name}")
+    blob_list = container_client.list_blobs()
+    for blob in blob_list:
+        if source in blob.name:
+            blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob.name)
+            os.makedirs(os.path.dirname(blob.name), exist_ok=True)
+            with open(blob.name, "wb") as download_file:
+                download_file.write(blob_client.download_blob().readall())
+
+
+if not arg.download and not arg.upload:
+    raise Exception('Specify either --upload or --download (exactly one).')
+
+if arg.download: #downloading from source to target
+    if not arg.is_directory:
+        download_file_from_blob(arg.source, arg.target)
+    else:
+        download_directory_from_blob(arg.source, arg.target)
+else: #Uploading source to target
+    if not arg.is_directory:
+        upload_file_to_blob(arg.source, arg.target)
+    else:
+        upload_directory_to_blob(arg.source, arg.target)

+ 107 - 0
Azure/blob-upload.py

@@ -0,0 +1,107 @@
+# ----------------------------------------------------------------------------------
+# MIT License
+#
+# Copyright(c) Microsoft Corporation. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# ----------------------------------------------------------------------------------
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+
+import os
+import uuid
+import sys
+from azure.storage.blob import BlockBlobService, PublicAccess
+
+# ---------------------------------------------------------------------------------------------------------
+# Method that creates a test file in the 'Sample' folder.
+# This sample application creates a test file, uploads the test file to the Blob storage,
+# lists the blobs in the container, and downloads the file with a new name.
+# ---------------------------------------------------------------------------------------------------------
+# Documentation References:
+# Associated Article - https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python
+# What is a Storage Account - http://azure.microsoft.com/en-us/documentation/articles/storage-whatis-account/
+# Getting Started with Blobs-https://docs.microsoft.com/en-us/azure/storage/blobs/storage-python-how-to-use-blob-storage
+# Blob Service Concepts - http://msdn.microsoft.com/en-us/library/dd179376.aspx
+# Blob Service REST API - http://msdn.microsoft.com/en-us/library/dd135733.aspx
+# ----------------------------------------------------------------------------------------------------------
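+# NOTE (added for context): this quickstart targets the legacy azure-storage SDK, where BlockBlobService
+# is exposed by azure.storage.blob (azure-storage-blob <= 2.1). The current azure-storage-blob (>= 12)
+# replaces it with BlobServiceClient / ContainerClient / BlobClient, as used in the other upload scripts here.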
+
+
+def run_sample():
+    try:
+        # Create the BlockBlobService that is used to call the Blob service for the storage account
+        blob_service_client = BlockBlobService(
+            account_name='accountname', account_key='accountkey')
+
+        # Create a container called 'quickstartblobs'.
+        container_name = 'quickstartblobs'
+        blob_service_client.create_container(container_name)
+
+        # Set the permission so the blobs are public.
+        blob_service_client.set_container_acl(
+            container_name, public_access=PublicAccess.Container)
+
+        # Create the Sample folder if it does not exist, and create a file in it to test the upload and download.
+        local_path = os.path.expanduser("~/Sample")
+        if not os.path.exists(local_path):
+            os.makedirs(local_path)
+        local_file_name = "QuickStart_" + str(uuid.uuid4()) + ".txt"
+        full_path_to_file = os.path.join(local_path, local_file_name)
+
+        # Write text to the file.
+        with open(full_path_to_file, 'w') as file:
+            file.write("Hello, World!")
+
+        print("Temp file = " + full_path_to_file)
+        print("\nUploading to Blob storage as blob" + local_file_name)
+
+        # Upload the created file, use local_file_name for the blob name
+        blob_service_client.create_blob_from_path(
+            container_name, local_file_name, full_path_to_file)
+
+        # List the blobs in the container
+        print("\nList blobs in the container")
+        generator = blob_service_client.list_blobs(container_name)
+        for blob in generator:
+            print("\t Blob name: " + blob.name)
+
+        # Download the blob(s).
+        # Append '_DOWNLOADED' before the '.txt' extension so you can see both files in the Sample folder.
+        full_path_to_file2 = os.path.join(local_path, str.replace(
+            local_file_name, '.txt', '_DOWNLOADED.txt'))
+        print("\nDownloading blob to " + full_path_to_file2)
+        blob_service_client.get_blob_to_path(
+            container_name, local_file_name, full_path_to_file2)
+
+        sys.stdout.write("Sample finished running. When you hit <any key>, the sample will be deleted and the sample "
+                         "application will exit.")
+        sys.stdout.flush()
+        input()
+
+        # Clean up resources. This includes the container and the temp files
+        blob_service_client.delete_container(container_name)
+        os.remove(full_path_to_file)
+        os.remove(full_path_to_file2)
+    except Exception as e:
+        print(e)
+
+
+# Main method.
+if __name__ == '__main__':
+    run_sample()

+ 221 - 0
Azure/django-blob.py

@@ -0,0 +1,221 @@
+import mimetypes
+import datetime
+
+from azure.common import AzureMissingResourceHttpError
+from azure.storage.blob import BlobService
+
+from django.core.files.storage import Storage
+from django.conf import settings
+
+try:
+    from django.utils.deconstruct import deconstructible
+except ImportError:
+    # Fallback for Django versions before 1.7, where deconstructible is not available
+    def deconstructible(func):
+        return func
+
+
+@deconstructible
+class AzureStorage(Storage):
+    """
+    Custom file storage system for Azure
+    """
+
+    container = settings.AZURE_STORAGE.get('CONTAINER')
+    account_name = settings.AZURE_STORAGE.get('ACCOUNT_NAME')
+    account_key = settings.AZURE_STORAGE.get('ACCOUNT_KEY')
+    cdn_host = settings.AZURE_STORAGE.get('CDN_HOST')
+    use_ssl = settings.AZURE_STORAGE.get('USE_SSL')
+
+    def __init__(self, account_name=None, account_key=None, container=None,
+         use_ssl=None, cdn_host=None):
+
+        if account_name is not None:
+            self.account_name = account_name
+
+        if account_key is not None:
+            self.account_key = account_key
+
+        if container is not None:
+            self.container = container
+
+        if use_ssl is not None:
+            self.use_ssl = use_ssl
+
+        if cdn_host is not None:
+            self.cdn_host = cdn_host
+
+    def __getstate__(self):
+        return dict(
+            account_name=self.account_name,
+            account_key=self.account_key,
+            container=self.container,
+            cdn_host=self.cdn_host,
+            use_ssl=self.use_ssl
+        )
+
+    def _get_service(self):
+        if not hasattr(self, '_blob_service'):
+            self._blob_service = BlobService(
+                account_name=self.account_name,
+                account_key=self.account_key,
+                protocol='https' if self.use_ssl else 'http'
+            )
+
+        return self._blob_service
+
+    def _get_properties(self, name):
+        return self._get_service().get_blob_properties(
+            container_name=self.container,
+            blob_name=name
+        )
+
+    def _open(self, name, mode='rb'):
+        """
+        Return the blob contents wrapped in a ContentFile.
+        """
+
+        from django.core.files.base import ContentFile
+
+        contents = self._get_service().get_blob_to_bytes(
+            container_name=self.container,
+            blob_name=name
+        )
+
+        return ContentFile(contents)
+
+    def _save(self, name, content):
+        """
+        Use the Azure Storage service to write ``content`` to a remote file
+        (called ``name``).
+        """
+        
+
+        content.open()
+
+        content_type = None
+
+        if hasattr(content.file, 'content_type'):
+            content_type = content.file.content_type
+        else:
+            content_type = mimetypes.guess_type(name)[0]
+
+        cache_control = self.get_cache_control(
+            self.container,
+            name,
+            content_type
+        )
+
+        self._get_service().put_block_blob_from_file(
+            container_name=self.container,
+            blob_name=name,
+            stream=content,
+            x_ms_blob_content_type=content_type,
+            cache_control=cache_control,
+            x_ms_blob_cache_control=cache_control
+        )
+
+        content.close()
+
+        return name
+
+    def listdir(self, path):
+        """
+        Lists the contents of the specified path, returning a 2-tuple of lists;
+        the first item being directories, the second item being files.
+        """
+
+        files = []
+
+        if path and not path.endswith('/'):
+            path = '%s/' % path
+
+        path_len = len(path)
+
+        if not path:
+            path = None
+
+        blob_list = self._get_service().list_blobs(self.container, prefix=path)
+
+        for blob in blob_list:
+            files.append(blob.name[path_len:])
+
+        return ([], files)
+
+    def exists(self, name):
+        """
+        Returns True if a file referenced by the given name already exists in
+        the storage system, or False if the name is available for a new file.
+        """
+        try:
+            self._get_properties(name)
+
+            return True
+        except AzureMissingResourceHttpError:
+            return False
+
+    def delete(self, name):
+        """
+        Deletes the file referenced by name.
+        """
+
+        try:
+            self._get_service().delete_blob(self.container, name)
+        except AzureMissingResourceHttpError:
+            pass
+
+    def get_cache_control(self, container, name, content_type):
+        """
+        Get the Cache-Control value for a blob, used when saving the blob on
+        Azure.  Returns `None` by default to remain compatible with the
+        default setting for the SDK.
+        """
+
+        return None
+
+    def size(self, name):
+        """
+        Returns the total size, in bytes, of the file referenced by name.
+        """
+
+        try:
+            properties = self._get_properties(name)
+
+            return int(properties['content-length'])
+        except AzureMissingResourceHttpError:
+            pass
+
+    def url(self, name):
+        """
+        Returns the URL where the contents of the file referenced by name can
+        be accessed.
+        """
+
+        blob_url_args = {
+            'container_name': self.container,
+            'blob_name': name,
+        }
+
+        if self.cdn_host:
+            # The account name should be built into the cdn hostname
+            blob_url_args['account_name'] = ''
+            blob_url_args['host_base'] = self.cdn_host
+
+        return self._get_service().make_blob_url(
+            **blob_url_args
+        )
+
+    def modified_time(self, name):
+        """
+        Returns a datetime object containing the last modified time.
+        """
+
+        try:
+            properties = self._get_properties(name)
+
+            return datetime.datetime.strptime(
+                properties['last-modified'],
+                '%a, %d %b %Y %H:%M:%S %Z'
+            )
+        except AzureMissingResourceHttpError:
+            pass

+ 1 - 0
Azure/python-text-classification

@@ -0,0 +1 @@
+Subproject commit 8078e57805781f1453f1dd7ea84f8b93aa70cafa

+ 555 - 0
Azure/storage-blob.py

@@ -0,0 +1,555 @@
+#----------------------------------------------------------------------------------
+# Microsoft Developer & Platform Evangelism
+#
+# Copyright (c) Microsoft Corporation. All rights reserved.
+#
+# THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, 
+# EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES 
+# OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
+#----------------------------------------------------------------------------------
+# The example companies, organizations, products, domain names,
+# e-mail addresses, logos, people, places, and events depicted
+# herein are fictitious.  No association with any real company,
+# organization, product, domain name, email address, logo, person,
+# places, or events is intended or should be inferred.
+#----------------------------------------------------------------------------------
+
+import os
+import config
+from random_data import RandomData
+import base64
+import datetime
+import time
+from azure.storage import CloudStorageAccount, AccessPolicy
+from azure.storage.blob import BlockBlobService, PageBlobService, AppendBlobService
+from azure.storage.models import CorsRule, Logging, Metrics, RetentionPolicy, ResourceTypes, AccountPermissions
+from azure.storage.blob.models import BlobBlock, ContainerPermissions, ContentSettings
+#
+# Azure Storage Blob Sample - Demonstrate how to use the Blob Storage service. 
+# Blob storage stores unstructured data such as text, binary data, documents or media files. 
+# Blobs can be accessed from anywhere in the world via HTTP or HTTPS. 
+#
+ 
+# Documentation References: 
+#  - What is a Storage Account - http://azure.microsoft.com/en-us/documentation/articles/storage-whatis-account/ 
+#  - Getting Started with Blobs - https://azure.microsoft.com/en-us/documentation/articles/storage-python-how-to-use-blob-storage/
+#  - Blob Service Concepts - http://msdn.microsoft.com/en-us/library/dd179376.aspx 
+#  - Blob Service REST API - http://msdn.microsoft.com/en-us/library/dd135733.aspx 
+#  - Blob Service Python API - http://azure.github.io/azure-storage-python/ref/azure.storage.blob.html
+#  - Storage Emulator - http://azure.microsoft.com/en-us/documentation/articles/storage-use-emulator/ 
+#
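+# Example invocation (illustrative; the account credentials are assumed to come from the accompanying
+# config module, as in the other samples in this folder):
+#   account = CloudStorageAccount(account_name=config.STORAGE_ACCOUNT_NAME, account_key=config.STORAGE_ACCOUNT_KEY)
+#   BlobAdvancedSamples().run_all_samples(account)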
+class BlobAdvancedSamples():
+
+    def __init__(self):
+        self.random_data = RandomData()
+
+    # Runs all samples for Azure Storage Blob service.
+    # Input Arguments:
+    # account - CloudStorageAccount to use for running the samples
+    def run_all_samples(self, account):
+        print('\n\nAzure Storage Blob advanced sample - Starting.')
+        
+        try:
+            print('\n\n* Container operations *\n')
+            self.list_containers(account)
+
+            print('\n\n* Set CORS *\n')
+            self.set_cors_rules(account)
+
+            print('\n\n* Container lease *\n')
+            self.lease_container(account)
+
+            print('\n\n* Copy blob *\n')
+            self.copy_blob(account)
+            
+            print('\n\n* Page blob operations *\n')
+            self.page_blob_operations(account)
+            
+            print('\n\n* Block blob operations *\n')
+            self.block_blob_operations(account)
+
+            print('\n\n* Properties and Metadata operations *\n')
+            self.properties_and_metadata_operations(account)
+            
+            print('\n\n* Container ACL operations *\n')
+            self.container_acl_operations(account)
+
+            print('\n\n* Blob lease *\n')
+            self.lease_blob(account)  
+            
+            if (config.IS_EMULATED):
+                print('\nShared Access Signature is not supported in emulator')
+            else:
+                print('\n\n* Container with SAS operations *\n')
+                self.container_operations_with_sas(account)      
+  
+                print('\n\n* SAS with access policy *\n')
+                self.sas_with_container_access_policy(account)
+
+                print('\n\n* Set blob service logging and metrics properties *\n')
+                self.set_service_properties(account)
+
+        except Exception as e:
+            if (config.IS_EMULATED):
+                print('Error occurred in the sample. If you are using the emulator, please make sure the emulator is running.', e)
+            else: 
+                print('Error occurred in the sample. Please make sure the account name and key are correct.', e)
+
+        finally:
+            print('\nAzure Storage Blob advanced sample - Completed.\n')
+
+
+    # Copy a source blob to a destination blob
+    def copy_blob(self, account):
+
+        file_upload = "HelloWorld.png"
+        container_name = 'blockblobcontainer' + self.random_data.get_random_name(6)
+
+        # Create a Block Blob Service object
+        blockblob_service = account.create_block_blob_service()
+
+        try:
+            # Create a new container
+            print('1. Create a container with name - ' + container_name)
+            blockblob_service.create_container(container_name)
+                    
+            # Upload file as a block blob
+            print('2. Upload BlockBlob')
+            #Get full path on drive to file_to_upload by joining the fully qualified directory name and file name on the local drive
+            full_path_to_file = os.path.join(os.path.dirname(__file__), file_upload)
+            blockblob_service.create_blob_from_path(container_name, file_upload, full_path_to_file)
+
+            target_blob = "target.png"
+            blob_source_url = blockblob_service.make_blob_url(container_name, file_upload)
+
+            print('3. Copy blob')
+            blockblob_service.copy_blob(container_name, target_blob, blob_source_url)
+
+            print('4. Get target blob')
+            target_blob_properties = blockblob_service.get_blob_properties(container_name, target_blob)
+
+            print('5. Get copy properties')
+            copy_properties = target_blob_properties.properties.copy
+            
+            print('Copy properties status: ' + copy_properties.status)
+
+            if(copy_properties.status == "pending"):
+                print('6. Abort copy')
+                blockblob_service.abort_copy_blob(container_name, target_blob, copy_properties.id)
+        finally:
+            # Delete the container
+            print("7. Delete Container")
+            if blockblob_service.exists(container_name):
+                blockblob_service.delete_container(container_name)
+
+    def sas_with_container_access_policy(self, account):
+        container_name = 'demosasblobcontainer' + self.random_data.get_random_name(6)
+        
+        blockblob_service = account.create_block_blob_service()
+        
+        try:
+            print('1. Create a container with name - ' + container_name)
+            blockblob_service.create_container(container_name)
+            
+            print('2. Create blob "blo1" with text')
+            blockblob_service.create_blob_from_text(container_name, 'blob1', b'hello world')
+
+            print('3. Set access policy for container')
+            # Set access policy on container
+            access_policy = AccessPolicy(permission=ContainerPermissions.READ,
+                                        expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=1))
+            identifiers = {'id': access_policy}
+            acl = blockblob_service.set_container_acl(container_name, identifiers)
+
+            # Wait 30 seconds for acl to propagate
+            print('Wait 30 seconds for acl to propagate')
+            time.sleep(30)
+
+            print('4. Get sas for access policy in container')
+            # Indicates to use the access policy set on the container
+            sas = blockblob_service.generate_container_shared_access_signature(
+                container_name,
+                id='id'
+            )
+
+            print('5. Create blob service with sas')
+            # Create a service and use the SAS
+            shared_blockblob_service = BlockBlobService(
+                account_name=account.account_name,
+                sas_token=sas,
+            )
+
+            print('6. Read blob content with sas')
+            blob = shared_blockblob_service.get_blob_to_text(container_name, 'blob1')
+            content = blob.content # hello world
+        finally:
+            print('7. Delete container')
+            blockblob_service.delete_container(container_name)
+        
+        print("SAS with access policy sample completed")
+        
+    def container_operations_with_sas(self, account):
+        container_name = 'demosasblobcontainer' + self.random_data.get_random_name(6)
+        
+        # Create a Block Blob Service object
+        blockblob_service = account.create_block_blob_service()
+        
+        # Create a Shared Access Signature for the account
+        print('1. Get account sas')
+        
+        account_sas = blockblob_service.generate_account_shared_access_signature(
+            ResourceTypes.CONTAINER + ResourceTypes.OBJECT, 
+            AccountPermissions.READ + AccountPermissions.WRITE + AccountPermissions.DELETE + AccountPermissions.LIST + AccountPermissions.CREATE, 
+            datetime.datetime.utcnow() + datetime.timedelta(hours=1))
+
+        shared_account = CloudStorageAccount(account_name=account.account_name, sas_token=account_sas)
+        shared_account_block_service = shared_account.create_block_blob_service()
+
+        try:
+            print('2. Create container with account sas. Container name - ' + container_name)
+            shared_account_block_service.create_container(container_name)
+            
+            # For the purposes of the demo, get a Container SAS
+            # In a real-world application, the above Account SAS can be used
+            print('3. Get container sas')
+            container_sas = blockblob_service.generate_container_shared_access_signature(
+                container_name, 
+                ContainerPermissions.READ + ContainerPermissions.WRITE + ContainerPermissions.DELETE + ContainerPermissions.LIST, 
+                datetime.datetime.utcnow() + datetime.timedelta(hours=1))
+            
+            shared_container_account = CloudStorageAccount(account_name=account.account_name, sas_token=container_sas)
+            shared_container_block_service = shared_container_account.create_block_blob_service()
+            
+            print('4. Create blob with container sas')
+            shared_container_block_service.create_blob_from_text(container_name, 'myblob', 'blob data')
+            
+            print('5. List blobs with container sas')
+            blobs = shared_container_block_service.list_blobs(container_name)
+            for blob in blobs:
+                print('blob ' + blob.name)
+            
+            print('6. Delete blob with container sas')
+            shared_container_block_service.delete_blob(container_name, 'myblob')
+        finally:            
+            print('7. Delete container')
+            blockblob_service.delete_container(container_name)
+            
+        print("Containers Sas sample completed")
+        
+    def list_containers(self, account):
+        
+        container_prefix = 'blockblobcontainers' + self.random_data.get_random_name(6)
+        
+        # Create a Block Blob Service object
+        blockblob_service = account.create_block_blob_service()
+
+        try:
+            # Create containers
+            for i in range(5):
+                container_name = container_prefix + str(i)
+                print('1. Create a container with name - ' + container_name)
+                blockblob_service.create_container(container_name)
+            
+            # List all the blobs in the container 
+            print('2. List containers with prefix ' + container_prefix)
+            containers = blockblob_service.list_containers(container_prefix)
+            for container in containers:
+                print('\tContainer Name: ' + container.name)
+        finally:
+            # Delete the containers
+            print("3. Delete Containers")
+            for i in range(5):
+                container_name = container_prefix + str(i)
+                if blockblob_service.exists(container_name):
+                    blockblob_service.delete_container(container_name)
+            
+        print("Containers sample completed")
+
+    def container_acl_operations(self, account):
+        
+        container_name = 'aclblockblobcontainer' + self.random_data.get_random_name(6)
+        
+        # Create a Block Blob Service object
+        blockblob_service = account.create_block_blob_service()
+
+        try:
+            print('1. Create a container with name - ' + container_name)
+            blockblob_service.create_container(container_name)
+                
+            print('2. Set access policy for container')
+            access_policy = AccessPolicy(permission=ContainerPermissions.READ,
+                                        expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=1))
+            identifiers = {'id': access_policy}
+            blockblob_service.set_container_acl(container_name, identifiers)
+
+            print('3. Get access policy from container')
+            acl = blockblob_service.get_container_acl(container_name)
+
+            print('4. Clear access policy in container')
+            # Clear
+            blockblob_service.set_container_acl(container_name)
+
+        finally:            
+            print('5. Delete container')
+            blockblob_service.delete_container(container_name)
+            
+        print("Container ACL operations sample completed")
+        
+    def properties_and_metadata_operations(self, account):
+        file_blob_name = "HelloWorld.png"
+        text_blob_name = "Text"
+         
+        # Create a Block Blob Service object
+        blockblob_service = account.create_block_blob_service()
+
+        container_name = 'blockblobbasicscontainer' + self.random_data.get_random_name(6)
+
+        try:
+            # Create a new container
+            print('1. Create a container with name and custom metadata - ' + container_name)
+            blockblob_service.create_container(container_name, {'sample':'azure-storage'})
+                    
+            # Upload file as a block blob
+            print('2. Uploading BlockBlob from file with properties and custom metadata')
+            #Get full path on drive to file_to_upload by joining the fully qualified directory name and file name on the local drive
+            full_path_to_file = os.path.join(os.path.dirname(__file__), file_blob_name)
+            
+            blockblob_service.create_blob_from_path(container_name, file_blob_name, full_path_to_file, 
+                content_settings=ContentSettings(content_type='application/png'),
+                metadata={'category':'azure-samples'})
+            
+            blockblob_service.create_blob_from_text(container_name, text_blob_name, 'Data',
+                content_settings=ContentSettings(content_encoding ='UTF-8', content_language='en'),
+                metadata={'origin':'usa', 'title': 'azure-samples'})
+            
+            # Get all the container properties 
+            print('3. Get Container metadata')
+
+            container = blockblob_service.get_container_properties(container_name)
+            
+            print('    Metadata:')
+
+            for key in container.metadata:
+                print('        ' + key + ':' + container.metadata[key])
+            
+            # Get all the blob properties 
+            print('4. Get Blob properties')
+            blob = blockblob_service.get_blob_properties(container_name, file_blob_name)
+            
+            print('    Metadata:')
+            for key in blob.metadata:
+                print('        ' + key + ':' + blob.metadata[key])
+            
+            print('    Properties:')
+            print('        Content-Type:' + blob.properties.content_settings.content_type)
+        finally:            
+            # Delete the container
+            print("5. Delete Container")
+            if blockblob_service.exists(container_name):
+                blockblob_service.delete_container(container_name)
+        
+    # Set CORS
+    def set_cors_rules(self, account):
+
+        # Create a Block Blob Service object
+        blockblob_service = account.create_block_blob_service()
+        
+        cors_rule = CorsRule(
+            allowed_origins=['*'], 
+            allowed_methods=['POST', 'GET'],
+            allowed_headers=['*'],
+            exposed_headers=['*'],
+            max_age_in_seconds=3600)
+        
+        print('1. Get Cors Rules')
+        original_cors_rules = blockblob_service.get_blob_service_properties().cors
+        
+        try:
+            print('2. Overwrite Cors Rules')
+            blockblob_service.set_blob_service_properties(cors=[cors_rule])
+        finally:        
+            print('3. Revert Cors Rules back to the original ones')
+            #reverting cors rules back to the original ones
+            blockblob_service.set_blob_service_properties(cors=original_cors_rules)
+        
+        print("CORS sample completed")
+
+    # Lease Container
+    def lease_container(self, account):
+        # Create a Block Blob Service object
+        blockblob_service = account.create_block_blob_service()
+        
+        try:
+            container_name = 'blockblobcontainer' + self.random_data.get_random_name(6)
+            print('1. Create a container with name - ' + container_name)
+            blockblob_service.create_container(container_name)
+
+            print('2. Acquire lease on container')
+            lease_id = blockblob_service.acquire_container_lease(container_name, lease_duration=15)
+
+            print("3. Deleted container without lease")
+            try:
+                blockblob_service.delete_container(container_name)
+            except:
+                print('Got expected exception. Cannot delete container, lease not specified')
+        finally:
+            print("4. Delete container with lease")
+            blockblob_service.delete_container(container_name, lease_id=lease_id)
+
+        print("Lease container sample completed")
+
+    # Lease Blob
+    def lease_blob(self, account):
+        blob_name = "exclusive"
+        
+        # Create a block blob service object
+        blockblob_service = account.create_block_blob_service()
+        container_name = 'blobcontainer' + self.random_data.get_random_name(6)
+
+        try:
+            # Create a new container
+            print('1. Create a container with name - ' + container_name)
+            blockblob_service.create_container(container_name)
+                    
+            # Create a block blob
+            print('2. Create Block Blob')
+            blob = self.random_data.get_random_bytes(255)
+            blockblob_service.create_blob_from_bytes(container_name, blob_name, blob)
+            
+            print('3. Acquire lease on blob')
+            lease_id = blockblob_service.acquire_blob_lease(container_name, blob_name, lease_duration=15)
+            
+            # Write to a block blob
+            print('4. Try to write to Block Blob without lease')
+            block_id = self.random_data.get_random_name(32)
+            block = self.random_data.get_random_bytes(255)
+            try:
+                blockblob_service.put_block(container_name, blob_name, block, block_id)
+            except:
+                print('Got expected exception. Cannot write blob, lease not specified')
+
+            print('5. Write to Block Blob with lease')
+            blockblob_service.put_block(container_name, blob_name, block, block_id, lease_id=lease_id)
+
+            print("6. Deleted blob without lease")
+            try:
+                blockblob_service.delete_blob(container_name, blob_name)
+            except:
+                print('Got expected exception. Cannot delete blob, lease not specified')
+
+            print("7. Delete blob with lease")
+            blockblob_service.delete_blob(container_name, blob_name, lease_id=lease_id)
+        finally:
+            print("8. Delete container")
+            if blockblob_service.exists(container_name):
+                blockblob_service.delete_container(container_name)
+
+        print("Lease blob sample completed")
+        
+    #Page Blob Operations
+    def page_blob_operations(self, account):
+        file_to_upload = "HelloWorld.png"
+        page_size = 1024
+        
+        # Create a page blob service object
+        pageblob_service = account.create_page_blob_service()
+        container_name = 'pageblobcontainer' + self.random_data.get_random_name(6)
+
+        try:
+            # Create a new container
+            print('1. Create a container with name - ' + container_name)
+            pageblob_service.create_container(container_name)
+            
+            # Create a new page blob to upload the file
+            print('2. Create a page blob')
+            pageblob_service.create_blob(container_name, file_to_upload, page_size * 1024)
+            
+            # Read the file
+            print('3. Upload pages to page blob')
+            index = 0
+            with open(file_to_upload, "rb") as file:
+                file_bytes = file.read(page_size)
+                while len(file_bytes) > 0:
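+                    # Zero-pad the final chunk: page blob writes must cover whole 512-byte-aligned pages,
+                    # so a short final read is padded up to page_size before update_page is called.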
+                    if len(file_bytes) < page_size:
+                        file_bytes = bytes(file_bytes + bytearray(page_size - len(file_bytes)))
+                        
+                    pageblob_service.update_page(container_name, file_to_upload, file_bytes, index * page_size, index * page_size + page_size - 1)
+                    
+                    file_bytes = file.read(page_size)
+                    
+                    index = index + 1
+            
+            pages = pageblob_service.get_page_ranges(container_name, file_to_upload)
+            
+            print('4. Enumerate pages in page blob')
+            for page in pages:
+                print('Page ' + str(page.start) + ' - ' + str(page.end))
+        finally:
+            print('5. Delete container')
+            if pageblob_service.exists(container_name):
+                pageblob_service.delete_container(container_name)
+
+    #Block Blob Operations
+    def block_blob_operations(self, account):
+        file_to_upload = "HelloWorld.png"
+        block_size = 1024
+        
+        # Create a block blob service object
+        blockblob_service = account.create_block_blob_service()
+        container_name = 'blockblobcontainer' + self.random_data.get_random_name(6)
+
+        try:
+            # Create a new container
+            print('1. Create a container with name - ' + container_name)
+            blockblob_service.create_container(container_name)
+            
+            blocks = []
+            
+            # Read the file
+            print('2. Upload file to block blob')
+            with open(file_to_upload, "rb") as file:
+                file_bytes = file.read(block_size)
+                while len(file_bytes) > 0:
+                    block_id = self.random_data.get_random_name(32) 
+                    blockblob_service.put_block(container_name, file_to_upload, file_bytes, block_id)                    
+                    
+                    blocks.append(BlobBlock(id=block_id))
+                    
+                    file_bytes = file.read(block_size)
+            
+            blockblob_service.put_block_list(container_name, file_to_upload, blocks)
+            
+            print('3. Get the block list')
+            blockslist = blockblob_service.get_block_list(container_name, file_to_upload, None, 'all')
+            blocks = blockslist.committed_blocks
+
+            print('4. Enumerate blocks in block blob')
+            for block in blocks:
+                print('Block ' + block.id)
+        finally:
+            print('5. Delete container')
+            if blockblob_service.exists(container_name):
+                blockblob_service.delete_container(container_name)
+
+    # Manage properties of the Blob service, including logging and metrics settings, and the default service version.
+    def set_service_properties(self, account):
+
+        # Create a block blob service object
+        blockblob_service = account.create_block_blob_service()
+
+        print('1. Get Blob service properties')
+        props = blockblob_service.get_blob_service_properties()
+
+        retention = RetentionPolicy(enabled=True, days=5)
+        logging = Logging(delete=True, read=False, write=True, retention_policy=retention)
+        hour_metrics = Metrics(enabled=True, include_apis=True, retention_policy=retention)
+        minute_metrics = Metrics(enabled=False)
+
+        try:
+            print('2. Overwrite Blob service properties')
+            blockblob_service.set_blob_service_properties(logging=logging, hour_metrics=hour_metrics, minute_metrics=minute_metrics, target_version='2015-04-05')
+        finally:
+            print('3. Revert Blob service properties back to the original ones')
+            blockblob_service.set_blob_service_properties(logging=props.logging, hour_metrics=props.hour_metrics, minute_metrics=props.minute_metrics, target_version='2015-04-05')
+
+        print('4. Set Blob service properties completed')

+ 130 - 0
Azure/table-service.py

@@ -0,0 +1,130 @@
+import requests
+import config
+from azure import storage
+from PackageInformationWorker.PyPIPackageInformation import PyPIPackageInformation
+import json
+import azure.storage.queue as queue
+import traceback
+import urllib
+import logging
+
+logger = logging.getLogger()
+account_name = config.STORAGE_ACCOUNT_NAME
+account_key = config.STORAGE_ACCOUNT_KEY
+STATIC_ROW_KEY = 'ROWKEY'
+table_service = storage.CloudStorageAccount(account_name, account_key).create_table_service()
+table_service.create_table(config.PACKAGE_VERSION_DATA_TABLENAME)
+table_service.create_table(config.PACKAGE_SUMMARY_TABLENAME)
+
+def main():
+    # package, version = ('azure', '1.0.0')
+    # get a package to look at
+    # check that package and version.
+    # version data just gets filled in
+    # summary trickier.
+    # summary -> name,
+    #               first_published (might be different than python2_start if
+    #               not using trove classifier)
+    #               python2_start (change if we find earlier),
+    #               python2_end (change if we find earlier, remove if package
+    #               after this come in and has python2),
+    #               python3_start (change if we find earlier)
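+    # Example (illustrative): a 2015 upload whose classifiers include 'Programming Language :: Python :: 2'
+    # pushes Python2_Start back to 2015 if the recorded start is later (or unset), and clears Python2_End
+    # if the recorded end date is earlier than that upload.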
+    try:
+        qs = queue.QueueService(config.STORAGE_ACCOUNT_NAME, config.STORAGE_ACCOUNT_KEY)
+        messages_in_batch = 5
+
+        while True:
+            messages = qs.get_messages(config.PACKAGE_QUEUE_NAME,numofmessages=messages_in_batch, visibilitytimeout=messages_in_batch*60)
+            for message in messages:
+                entity = json.loads(message.message_text)
+                _process_one_package(entity["package"], entity["version"])
+                # once completed delete the message
+                qs.delete_message(config.PACKAGE_QUEUE_NAME, message.message_id, message.pop_receipt)
+    except Exception as e:
+        # swallow exception here. we will just reprocess and delete the message.
+        # known failures:
+        # - connection aborted by get_messages sometimes.  this happens with a connectionreseterror (10054)
+        # - Random json errors. Could add retry.  
+        logger.error(traceback.format_exc())
+          
+def _process_one_package(package_name, version):
+    logger.info("Worker: Package:{} Version:{}".format(package_name, version))
+    if not package_name or not version:
+        logger.warn("Package_name or version was empty. Moving on as the queue had bad data")
+        return
+
+    # .6684 seconds to run.  74577 total packages
+    package_info = PyPIPackageInformation.get_package_specific_version_info(package_name, version)
+    if not package_info:
+        logger.error("Worker: Package:{} Version:{} failed to get package info".format(package_name, version))
+        return
+
+    supports_python_2 = len([x for x in package_info['classifiers'] if x.startswith('Programming Language :: Python :: 2')]) > 0
+    supports_python_3 = len([x for x in package_info['classifiers'] if x.startswith('Programming Language :: Python :: 3')]) > 0
+    uploaded = package_info['uploaded']
+
+    try:
+        summary_entity = table_service.get_entity(config.PACKAGE_SUMMARY_TABLENAME, package_name, STATIC_ROW_KEY)
+    except:
+        # we don't have a summary for this entry.
+        summary_entity = { 
+            'PartitionKey':package_name, 'RowKey':STATIC_ROW_KEY, 'First_Published':None, 
+            'Python2_Start':None, 'Python2_End':None, 'Python3_Start':None
+            }
+        table_service.insert_or_replace_entity(config.PACKAGE_SUMMARY_TABLENAME, package_name, STATIC_ROW_KEY, summary_entity)
+        summary_entity = table_service.get_entity(config.PACKAGE_SUMMARY_TABLENAME, package_name, STATIC_ROW_KEY)
+
+    # set fields using upload. Upload is none if the version has never been uploaded
+    # Basically just filter out packages that never have content from our records.
+    if uploaded is not None:
+        if not hasattr(summary_entity, 'First_Published') or summary_entity.First_Published is None or summary_entity.First_Published > uploaded:
+            # if the published date is empty or later than the current release we
+            # are viewing update
+            summary_entity.First_Published = uploaded
+
+        if supports_python_2 and \
+            (not hasattr(summary_entity, 'Python2_Start') or summary_entity.Python2_Start is None or summary_entity.Python2_Start > uploaded):
+            # if the published date is empty or later than the date and it supports
+            # python 2
+            summary_entity.Python2_Start = uploaded
+    
+        if supports_python_2 and hasattr(summary_entity, 'Python2_End') and summary_entity.Python2_End is not None and summary_entity.Python2_End < uploaded:
+            # this upload supports python2 but is later than the date we thought python2
+            # support ended, so support must not have actually ended
+            summary_entity.Python2_End = None    
+        elif hasattr(summary_entity, 'Python2_Start') and hasattr(summary_entity, 'Python2_End') and \
+            summary_entity.Python2_Start is not None and summary_entity.Python2_End is not None and \
+            (summary_entity.Python2_End > uploaded and summary_entity.Python2_Start < uploaded):
+            # if this upload does not support python2, python2 support started at some
+            # point, and the recorded end date falls after this upload, move the end
+            # date back to this upload
+            summary_entity.Python2_End = uploaded
+
+        if supports_python_3 and \
+            (not hasattr(summary_entity, 'Python3_Start') or summary_entity.Python3_Start is None or summary_entity.Python3_Start > uploaded):
+            # if the published date is empty or later than the current release we
+            # are viewing update
+            summary_entity.Python3_Start = uploaded
+
+    version_entity = _insert_entity_to_package_version_table(package_name, version, supports_python_2, supports_python_3, package_info['downloads'], uploaded)
+    summary_entity = table_service.insert_or_replace_entity(config.PACKAGE_SUMMARY_TABLENAME, package_name, STATIC_ROW_KEY, summary_entity)
+
+def _insert_entity_to_package_version_table(package, version, python2, python3, downloads, upload_time):
+    # TODO: issue with python azure storage.  Version can't have '~' in it. https://github.com/Azure/azure-storage-python/issues/76
+    package_sanitized = urllib.parse.quote_plus(package)
+    version_sanitized = urllib.parse.quote_plus(version)
+
+    try:
+        result =  table_service.insert_or_replace_entity(config.PACKAGE_VERSION_DATA_TABLENAME, package_sanitized, version_sanitized,
+                                    {'PartitionKey' : package_sanitized,
+                                     'RowKey': version_sanitized, 
+                                     'Python2': python2, 
+                                     'Python3': python3,
+                                     'Downloads': downloads,
+                                     'UploadTime': upload_time})
+
+        return result
+    except Exception as e:
+        logger.error("Failed to insert Package:{} Version:{} Python2:{} Python3:{} Downloads:{} UploadTime:{} Exception:{}".format(
+            package, version, python2, python3, downloads, upload_time, traceback.format_exc()))
+        raise e

+ 218 - 0
Azure/table-storage.py

@@ -0,0 +1,218 @@
+#-------------------------------------------------------------------------
+# Microsoft Developer & Platform Evangelism
+#
+# Copyright (c) Microsoft Corporation. All rights reserved.
+#
+# THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, 
+# EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES 
+# OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
+#----------------------------------------------------------------------------------
+# The example companies, organizations, products, domain names,
+# e-mail addresses, logos, people, places, and events depicted
+# herein are fictitious. No association with any real company,
+# organization, product, domain name, email address, logo, person,
+# places, or events is intended or should be inferred.
+#--------------------------------------------------------------------------
+import config
+import datetime
+import time
+from random_data import RandomData
+from tablestorageaccount import TableStorageAccount
+from azure.storage import CloudStorageAccount, AccessPolicy
+from azure.storage.table import TableService, Entity, TablePermissions
+from azure.storage.models import CorsRule, Logging, Metrics, RetentionPolicy, ResourceTypes, AccountPermissions
+
+#
+# Azure Table Service Sample - Demonstrate how to perform common tasks using the Microsoft Azure Table Service
+# including creating a table, CRUD operations and different querying techniques.
+#
+# Documentation References:
+#  - What is a Storage Account - http://azure.microsoft.com/en-us/documentation/articles/storage-whatis-account/
+#  - Getting Started with Tables - https://azure.microsoft.com/en-us/documentation/articles/storage-python-how-to-use-table-storage/
+#  - Table Service Concepts - http://msdn.microsoft.com/en-us/library/dd179463.aspx
+#  - Table Service REST API - http://msdn.microsoft.com/en-us/library/dd179423.aspx
+#  - Table Service Python API - http://azure.github.io/azure-storage-python/ref/azure.storage.table.html
+#  - Storage Emulator - http://azure.microsoft.com/en-us/documentation/articles/storage-use-emulator/
+#
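+# Example invocation (illustrative; TableStorageAccount comes from the accompanying tablestorageaccount
+# module, and the account_key argument is assumed by analogy with its sas_token usage below):
+#   account = TableStorageAccount(account_name=config.STORAGE_ACCOUNT_NAME, account_key=config.STORAGE_ACCOUNT_KEY)
+#   TableAdvancedSamples().run_all_samples(account)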
+class TableAdvancedSamples():
+
+    def __init__(self):
+        self.random_data = RandomData()
+
+    # Runs all samples for Azure Storage Table service.
+    def run_all_samples(self, account):
+        table_service = account.create_table_service()
+        print('Azure Storage Advanced Table samples - Starting.')
+        
+        print('\n\n* List tables *\n')
+        self.list_tables(table_service)
+        
+        if not account.is_azure_cosmosdb_table():
+           print('\n\n* Set service properties *\n')
+           self.set_service_properties(table_service)
+        
+           print('\n\n* Set Cors rules *\n')
+           self.set_cors_rules(table_service)
+        
+           print('\n\n* ACL operations *\n')
+           self.table_acl_operations(table_service)
+        
+        if (config.IS_EMULATED):
+            print('\n\n* Shared Access Signature is not supported in emulator *\n')
+        else:
+            print('\n\n* SAS operations *\n')
+            self.table_operations_with_sas(account)
+
+        print('\nAzure Storage Advanced Table samples - Completed.\n')
+
+    # Manage tables including creating, listing and deleting
+    def list_tables(self, table_service):
+        table_prefix = 'table' + self.random_data.get_random_name(6)
+
+        try:        
+            # Create tables
+            for i in range(5):
+                table_name = table_prefix + str(i)
+                print('1. Create a table with name - ' + table_name)
+                table_service.create_table(table_name)
+            
+            # List all the tables 
+            print('2. List tables')
+            tables = table_service.list_tables()
+            for table in tables:
+                print('\tTable Name: ' + table.name)
+
+        finally:
+            # Delete the tables
+            print("3. Delete Tables")
+            for i in range(5):
+                table_name = table_prefix + str(i)
+                if(table_service.exists(table_name)):
+                    table_service.delete_table(table_name)
+            
+        print("List tables sample completed")
+    
+    # Manage properties of the Table service, including logging and metrics settings, and the default service version.
+    def set_service_properties(self, table_service):
+        print('1. Get Table service properties')
+        props = table_service.get_table_service_properties()
+
+        retention = RetentionPolicy(enabled=True, days=5)
+        logging = Logging(delete=True, read=False, write=True, retention_policy=retention)
+        hour_metrics = Metrics(enabled=True, include_apis=True, retention_policy=retention)
+        minute_metrics = Metrics(enabled=False)
+
+        try:
+            print('2. Overwrite Table service properties')
+            table_service.set_table_service_properties(logging=logging, hour_metrics=hour_metrics, minute_metrics=minute_metrics)
+
+        finally:
+            print('3. Revert Table service properties back to the original ones')
+            table_service.set_table_service_properties(logging=props.logging, hour_metrics=props.hour_metrics, minute_metrics=props.minute_metrics)
+
+        print('4. Set Table service properties completed')
+    
+    # Manage CORS rules on the table service
+    def set_cors_rules(self, table_service):
+        cors_rule = CorsRule(
+            allowed_origins=['*'], 
+            allowed_methods=['POST', 'GET'],
+            allowed_headers=['*'],
+            exposed_headers=['*'],
+            max_age_in_seconds=3600)
+        
+        print('1. Get Cors Rules')
+        original_cors_rules = table_service.get_table_service_properties().cors
+
+        try:        
+            print('2. Overwrite Cors Rules')
+            table_service.set_table_service_properties(cors=[cors_rule])
+
+        finally:
+            #reverting cors rules back to the original ones
+            print('3. Revert Cors Rules back to the original ones')
+            table_service.set_table_service_properties(cors=original_cors_rules)
+        
+        print("CORS sample completed")
+
+    # Manage table access policy
+    def table_acl_operations(self, table_service):
+        table_name = 'acltable' + self.random_data.get_random_name(6)
+
+        try:        
+            print('1. Create a table with name - ' + table_name)
+            table_service.create_table(table_name)
+                
+            print('2. Set access policy for table')
+            access_policy = AccessPolicy(permission=TablePermissions.QUERY,
+                                        expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=1))
+            identifiers = {'id': access_policy}
+            table_service.set_table_acl(table_name, identifiers)
+
+            print('3. Wait 30 seconds for acl to propagate')
+            time.sleep(30)
+
+            print('4. Get access policy from table')
+            acl = table_service.get_table_acl(table_name)
+
+            print('5. Clear access policy in table')
+            table_service.set_table_acl(table_name)
+
+        finally:
+            print('6. Delete table')
+            if(table_service.exists(table_name)):
+                table_service.delete_table(table_name)
+            
+        print("Table ACL operations sample completed")
+    
+    # Manage shared access signature on a table
+    def table_operations_with_sas(self, account):
+        table_name = 'sastable' + self.random_data.get_random_name(6)
+        
+        try:
+            # Create a Table Service object
+            table_service = account.create_table_service()
+            
+            print('1. Create table with name - ' + table_name)
+            table_service.create_table(table_name)
+            
+            # Create a Shared Access Signature for the table
+            print('2. Get sas for table')
+            
+            table_sas = table_service.generate_table_shared_access_signature(
+                table_name, 
+                TablePermissions.QUERY + TablePermissions.ADD + TablePermissions.UPDATE + TablePermissions.DELETE, 
+                datetime.datetime.utcnow() + datetime.timedelta(hours=1))
+
+            shared_account = TableStorageAccount(account_name=account.account_name, sas_token=table_sas, endpoint_suffix=account.endpoint_suffix)
+            shared_table_service = shared_account.create_table_service()
+
+            # Create a sample entity to insert into the table
+            customer = {'PartitionKey': 'Harp', 'RowKey': '1', 'email' : 'harp@contoso.com', 'phone' : '555-555-5555'}
+
+            # Insert the entity into the table
+            print('3. Insert new entity into table with sas - ' + table_name)
+            shared_table_service.insert_entity(table_name, customer)
+            
+            # Demonstrate how to query the entity
+            print('4. Read the inserted entity with sas.')
+            entity = shared_table_service.get_entity(table_name, 'Harp', '1')
+            
+            print(entity['email'])
+            print(entity['phone'])
+
+            # Demonstrate how to update the entity by changing the phone number
+            print('5. Update an existing entity by changing the phone number with sas')
+            customer = {'PartitionKey': 'Harp', 'RowKey': '1', 'email' : 'harp@contoso.com', 'phone' : '425-123-1234'}
+            shared_table_service.update_entity(table_name, customer)
+
+            # Demonstrate how to delete an entity
+            print('6. Delete the entity with sas')
+            shared_table_service.delete_entity(table_name, 'Harp', '1')
+
+        finally:
+            print('7. Delete table')
+            if(table_service.exists(table_name)):
+                table_service.delete_table(table_name)
+            
+        print("Table operations with sas completed")

+ 47 - 0
BI/BIL.py

@@ -0,0 +1,47 @@
+import numpy as np
+import pandas as pd 
+import matplotlib.pyplot as plt
+import seaborn as sns
+import warnings
+warnings.filterwarnings('ignore')
+
+data=pd.read_csv('D:/Ajay/input/Suicide.csv')
+
+data=data.drop(['HDI for year','country-year'],axis=1)                  #dropping these two columns
+
+#-----Table------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+group_data=data.groupby(['age','sex'])['suicides_no'].sum().unstack()   #collecting data and making a table using 'unstack()'
+group_data=group_data.reset_index().melt(id_vars='age')                 #arranging according to age
+group_data_female=group_data.iloc[:6,:]                                 #retrieving the first 6 rows (female age groups) using 'iloc'
+print("\n--Table of Suicides according to Female Age Groups--\n")
+from IPython.display import display
+display(group_data_female)                                              #displaying table
+print("\n")
+
+#-----Country vs. suicide_no-------------------------------------------------------------------------------------------------------------------------------------------
+
+suicidesNo=[]
+for country in data.country.unique():                                   
+    suicidesNo.append(sum(data[data['country']==country].suicides_no))  #getting total no of suicides of all countries
+
+suicidesNo=pd.DataFrame(suicidesNo,columns=['suicides_no'])             
+country=pd.DataFrame(data.country.unique(),columns=['country'])
+data_suicide_countr=pd.concat([suicidesNo,country],axis=1)              #combining suicide totals and country names into one DataFrame
+
+data_suicide_countr=data_suicide_countr.sort_values(by='suicides_no',ascending=False)  #sorting in descending order (highest number of suicides first)
+
+sns.barplot(y=data_suicide_countr.country[:20],x=data_suicide_countr.suicides_no[:20])  #displaying bars of only 20 countries with highest no. of suicides
+plt.title("20 Countries with Higest Suicide Number from 1985 to 2016")
+plt.show()
+
+#-----Population vs. Age_group-----------------------------------------------------------------------------------------------------------------------------------------
+
+index_suicide=[]
+for age in data['age'].unique():
+    index_suicide.append(sum(data[data['age']==age].suicides_no)/len(data[data['age']==age].suicides_no))  #getting suicide rate of each age group
+    
+plt.bar(['5-14 years', '15-24 years', '25-34 years', '35-54 years', '55-74 years', '75+ years'],index_suicide,align='center',alpha=0.5) #defining xticks
+plt.xticks(rotation=45)                                                 #rotating xticks by 45 degrees anticlockwise
+plt.title("Suicide rates of Different Age Groups")
+plt.show()

+ 1 - 0
BI/BusinessIntelligence-Kaggle

@@ -0,0 +1 @@
+Subproject commit 06143b2ae0538affe8029950bf36597d253bcffd

+ 606 - 0
BI/ID3_classification.py

@@ -0,0 +1,606 @@
+# TODO mention in the report that with every level of the tree the data gets smaller and smaller.
+# NEXT STEPS
+# TODO: Create an infographic and host it on a web page.
+# TODO: Gather live data from news articles (Can try using NLTK & urllib).
+# TODO: Use Natural Language Processing to automate some of the data cleaning/integration.
+
+###################################################################################################################
+# Online Retail Analysis - ID3 CLASSIFICATION                                                                     #
+#    NOTE! Concepts will be explained with examples from the Street data set, which can be found below.           #
+#    The reason for this is because that data set is very small and easy to follow.                               #
+#                                                                                                                 #
+# 1) RESOURCES                                                                                                    #
+#    ID3 TUTORIALS:                                                                                               #
+#      1) https://sefiks.com/2017/11/20/a-step-by-step-id3-decision-tree-example/                                 #
+#      2) https://medium.com/coinmonks/what-is-entropy-and-why-information-gain-is-matter-4e85d46d2f01            #
+#                                                                                                                 #
+#    DECISION TREE TUTORIAL: https://www.lucidchart.com/pages/decision-tree                                       #
+#    ENTROPY (MORE DETAILS): https://en.wikipedia.org/wiki/Entropy_(information_theory)                           #
+#                                                                                                                 #
+# 2) DATA SETS                                                                                                    #
+#    TEST DATA SET: This data set can be found by navigating to the PLAY TENNIS DATA SET region in this file.     #
+#    It is a part of the ID3 file because I believe it would be useful to have an example of how the ID3 code     #
+#    works with a data set and also provides an opportunity to better understand what the code is doing.          #
+#    To have a look at ID3 applied to a small data set, just call the test_run_algorithm() function at the        #
+#    end of the file.                                                                                             #
+#                                                                                                                 #
+# 3) ALGORITHM OVERVIEW                                                                                           #
+#    Used to generate a decision tree from a given data set. It works by evaluating each attribute                #
+#    in the data set to place the nodes in an order that will return an accurate result.                          #
+#                                                                                                                 #
+# 4) USES                                                                                                         #
+#    A) Classify labeled data, e.g. for NLP tasks or for approving loans and credit cards.                        #
+#    B) Another non-standard use of this algorithm is to use it to fill a missing value in the data set           #
+#    during the pre-processing stage.                                                                             #
+#                                                                                                                 #
+###################################################################################################################
+
+import math
+import copy
+
+# region PERFORMANCE IMPROVEMENTS (for Python 3.8)
+""" 
+Applied: (TO DOCUMENT)
+
+TODO: 
+   1) Remove every dict.keys() usage and use the dict directly (e.g. `key in d` instead of `key in d.keys()`);
+      going through dict.keys() is more costly than looking through the dictionary itself. Further information below:
+      https://stackoverflow.com/questions/4730993/python-key-in-dict-keys-performance-for-large-dictionaries
+"""
+# endregion
+
+# region PLAY TENNIS DATA SET
+DATASET_BY_ATTRIB_DICT = {"outlook": ["Sunny", "Sunny", "Overcast", "Rain", "Rain", "Rain", "Overcast",
+                                      "Sunny", "Sunny", "Rain", "Sunny", "Overcast", "Overcast", "Rain"],
+                          "temperature": ["Hot", "Hot", "Hot", "Mild", "Cool", "Cool", "Cool",
+                                          "Mild", "Cool", "Mild", "Mild", "Mild", "Hot", "Mild"],
+                          "humidity": ["High", "High", "High", "High", "Normal", "Normal", "Normal",
+                                       "High", "Normal", "Normal", "Normal", "High", "Normal", "High"],
+                          "wind": ["Weak", "Strong", "Weak", "Weak", "Weak", "Strong", "Strong",
+                                   "Weak", "Weak", "Weak", "Strong", "Strong", "Weak", "Strong"]}
+
+
+# Answer as to whether or not it is a good time to play tennis.
+TARGET_ATTRIB_LIST = ["No", "No", "Yes", "Yes", "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "Yes", "Yes", "No"]
+
+# CONSTANT VARIABLES  # TODO: Treat these as immutable constants (e.g. annotate them with typing.Final; Python has no built-in const)
+TARGET_ATTRIB_NAME = "play tennis"
+TRAIN_DATA_SIZE = len(TARGET_ATTRIB_LIST)
+# endregion
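+# A hand-worked check (added for clarity, not part of the original file): with the 9 "Yes" / 5 "No"
+# split in TARGET_ATTRIB_LIST above, the target entropy later computed by calc_entropy should be
+#   E = -(9/14)*log2(9/14) - (5/14)*log2(5/14) ≈ 0.940
+# which is the classic value quoted for the Play Tennis data set.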
+
+
+# Represents a tree node and links to derived nodes.
+class Node:
+
+    def __init__(self, node_name, derived_nodes=()):
+        self.node_name = node_name
+        # An immutable default avoids Python's shared-mutable-default pitfall; passing None
+        # explicitly still marks an endpoint/leaf node, which the traversal code relies on.
+        self.derived_nodes = None if derived_nodes is None else list(derived_nodes)
+
+
+class ID3DecisionTree:
+    def __init__(self):
+        self.root_node = None
+
+        # Keeps track of all the nodes at the end of the branches that are available to link to.
+        # In this way, no code needs to be run to find the next available space for a new node.
+        # The node at index 0 is always the one to add to first, once the new node is linked to it, it gets popped off
+        # and the new node gets appended to the end of this list.
+        self.active_branch_nodes = []
+
+        # TODO: Merge this list with the active_branch_nodes to be in dictionary format like so
+        # {attrib1: [outcome1, outcome2], attrib2: [outcome1, outcome2, outcome3]}
+        self.linked_attributes = []
+
+        # IMPORTANT NOTE:
+        # Key to understanding how the DecisionTree class works is understanding the dataset_occurrence_dict
+        # structure, as that is what is used for most calculations. This structure contains only the data from the
+        # dataset required to construct the tree. Any repetition of attribute data has been removed to reduce load.
+        # The 'dataset_occurrence_dict' structure is an unordered dictionary, where the structure itself gives more
+        # information about the dataset. For example, every attribute of the data set is a key, which contains
+        # a dictionary of its outcomes/possible values, and for each outcome, there is a dictionary showing the
+        # distribution of the outcomes for the selected target attribute.
+        # Example of dictionary structure below.
+        """ Example structure: (where 'AN'-attribute name; 'ON'-outcome name; 'TON'-target outcome name) 
+            dataset_occurrence_dict = {"AN 1": {"ON 1": {"TON 1": 1, "TON 2": 2},
+                                                "ON 2": {"TON 1": 0, "TON 2": 1},
+                                                "ON 3": {"TON 1": 0, "TON 2": 1}
+                                                },
+                                       "AN 2": {"ON 1": {"TON 1": 4, "TON 2": 0},
+                                                "ON 2": {"TON 1": 1, "TON 2": 0}
+                                                }
+                                       }
+                                                
+            The example above can be read, for attribute 1 - AN1, there are 3 outcomes - ON1, ON2, ON3. 
+            The target has 2 possible outcomes TON1 and TON2. Those values are being tracked/accounted for, 
+            for each possible outcome of each attribute. For AN1, ON1 there is 1 occurrence of TON1 and 2 occurrences of 
+            TON2. For AN1, ON2 there are 0 occurrences of TON1, and 1 occurrence of TON2 therefore the answer for this 
+            branch is TON2. Same for AN1, ON3 - answer TON2. If all the occurrences of TON1 and TON2 for attrib 1 (AN1)
+            are summed, we get the number of entries in the given data set. 
+        """
+        self.dataset_occurrence_dict = {}
+
+    # region BUILD TREE UTILITIES
+    """ Construct dataset distribution/occurrence dictionary - "dataset_occurrence_dict".
+    PARAMETERS
+      :param (dict) dataset_by_attrib_dict
+      :param (list) target_list """
+    def generate_occurrences(self, dataset_by_attrib_dict, target_list):
+        # TODO: assert that all attribute lists have the same length
+
+        # Update the dictionary with each attribute
+        for attrib_name in dataset_by_attrib_dict.keys():
+            # STEP 1: ADD the current attribute to the 'dataset_occurrence_dict' structure
+            self.dataset_occurrence_dict.update({attrib_name: {}})
+
+            # STEP 2: Fetch a list containing only the unique data from attribute_list and target_list.
+            attribute_list = dataset_by_attrib_dict[attrib_name]
+            unique_attrib_outcomes = list(set(attribute_list))
+            unique_answers = list(set(target_list))
+
+            # For each unique outcome of the current attribute
+            for attrib_outcome in unique_attrib_outcomes:
+                #   2.1) Update dictionary to store the next attribute outcome
+                self.dataset_occurrence_dict[attrib_name].update({attrib_outcome: {}})
+                # print(self.dataset_occurrence_dict)
+
+                #   2.2) For the current attribute, look at each of its outcomes and add them onto the dictionary
+                for outcome in unique_answers:
+                    self.dataset_occurrence_dict[attrib_name][attrib_outcome].update({outcome: 0})
+                    # print(self.dataset_occurrence_dict)
+
+            # STEP 3: Goes through the dataset and counts the target outcome occurrences for each attribute occurrence
+            for itter in range(len(attribute_list)):
+                #   3.1) Fetch the current attribute outcome and the current target outcome from the dataset.
+                curr_attrib_occ = attribute_list[itter]
+                curr_target_occ = target_list[itter]
+
+                #   3.2) Update the count for the current target outcome in the current attribute outcome by 1
+                self.dataset_occurrence_dict[attrib_name][curr_attrib_occ][curr_target_occ] += 1
+
+    """ After a node is added to the tree the "dataset_occurrence_dict" dictionary should be updated.
+       PARAMETERS
+         :param (list) attrib_list - the raw attrib data from the dataset.
+         :param (list) target_list - the raw target data from the dataset. """
+    def get_next_branch_occurrences(self, dataset_by_attrib_dict, target_list):
+        # This is the outcome to update the dataset_occurrence_dict by
+
+        # A completely separate dictionary from the original, this dictionary will only hold a subdictionary
+        # of the original
+        subdict = copy.deepcopy(dataset_by_attrib_dict)
+        subtar = copy.deepcopy(target_list)
+
+        indices_to_remove = []
+        attrib_to_remove = None
+
+        # Looking through every possible attribute in the dictionary
+        for attrib_key in subdict:
+            attrib_found = False
+            # Count through each list of outcomes for the given attribute.
+            for count in range(len(subdict[attrib_key])):
+                # If the active outcome name is equal to the current outcome value in the list
+                if dataset_by_attrib_dict[attrib_key][count] == self.active_branch_nodes[0].node_name:
+                    attrib_found = True
+                    # According to the algorithm, the attribute containing the currently active outcome
+                    # should be removed
+                    if attrib_key in subdict:
+                        attrib_to_remove = attrib_key
+                else:
+                    indices_to_remove.append(count)
+                    # print(subdict[attrib_key][count])
+                    # subdict[attrib_key].pop(count)
+                    # TODO: assert that there is only one 0 in the list otherwise it is trying to remove the wrong values
+
+            if attrib_found:
+                break
+
+        # Processing the subdict data
+        #print("Subdict: ", subdict)
+        del subdict[attrib_to_remove]
+
+        for attrib in subdict:
+            #print("Discarding data in ", attrib)
+            complete_list = subdict[attrib]
+
+            sublist = [value for index, value in enumerate(complete_list) if index not in indices_to_remove]
+            subdict[attrib] = sublist
+
+        #print("After processing the data: ", subdict)
+
+        # Processing the subtar data
+        #print("Discarding data in target list")
+        #print("Target data before processing: ", subtar)
+        # print(indices_to_remove)
+        subtar = [value for index, value in enumerate(subtar) if index not in indices_to_remove]
+        #print("Target data after processing: ", subtar)
+
+        # TODO: Call this function recursively on each branch, pass in the shrunken dictionary
+        # TODO: test the base case thoroughly
+        # TODO: Build a new dataset_by_attrib_dict for the current outcome
+        # TODO: REMOVE outlook from the dataset dict when all its outcomes have children nodes assigned
+        # (How to know if an attribute is complete???)
+
+        return subdict, subtar
+
+    """ Checks if a branch is complete, i.e. the target outcome was found. 
+    PARAMETERS
+      :param  (dict) target_val_dist_for_attrib 
+      :returns (list) comp_branches - contains all the target outcomes reached for the given attribute."""
+    def track_target_outcomes(self, target_val_dist_for_attrib):
+        comp_branches = []
+
+        # Looks through each attribute outcome
+        for attrib_outcome_key in target_val_dist_for_attrib.keys():
+
+            # Tracks how many non-zero occurrences of a target outcome there are for this attribute outcome.
+            non_zero_outcome_count = 0
+
+            # This variable is set to the target outcome if the branch outcome is (100%) certain.
+            branch_answer = None
+
+            # Checks what the distribution of target outcomes is for the current attribute outcome.
+            # Ex: question - how do people drive based on the terrain: if the terrain is flat do they drive slow
+            # or fast, and what is it if the terrain is steep.
+            # Target outcomes - fast and slow; attrib outcomes - flat and steep.
+            # Distribution dictionary looks like this ->{'flat': {'slow': 0, 'fast': 1}, 'steep':{'slow': 2, 'fast': 1}}
+            for target_outcome_key in target_val_dist_for_attrib[attrib_outcome_key].keys():
+
+                # Fetch the number of occurrences for each target outcome for the current attribute
+                """"Another Example: if the target is can_buy_computer(possible values/outcomes: Yes or No) and the current 
+                attribute is age (possible values/outcomes:  <=30, 31..40 and >40) this will return how many of the entries 
+                where age is <=30 are no, then how many of the entries where age is <=30 are yes, then how many 
+                of the entries where age is 31..40 are yes and so on, until all cases are looked at. """
+                outcome_occurrences = target_val_dist_for_attrib[attrib_outcome_key][target_outcome_key]
+
+                # Check if the answer is certain and end the branch, i.e. count how many branches have
+                # certain target outcome
+                if outcome_occurrences > 0:
+                    non_zero_outcome_count += 1
+
+                    if non_zero_outcome_count == 1:
+                        branch_answer = target_outcome_key
+
+            if non_zero_outcome_count == 0:
+                print("INVALID RESULT!")
+            elif non_zero_outcome_count == 1:
+                print("THE ANSWER FOR <<", attrib_outcome_key, ">> is <<", branch_answer, ">>")
+                comp_branches.append({attrib_outcome_key: branch_answer})
+            elif non_zero_outcome_count > 1:
+                print("THE BRANCH <<", attrib_outcome_key, ">> IS STILL ACTIVE!")
+
+        return comp_branches
+
+    # Counts the occurrences of each value for a given attribute.
+    def count_value_occ(self, unique_values, attrib_data):
+        attrib_val_occ = {}
+
+        # Construct dictionary
+        for value in unique_values:
+            attrib_val_occ.update({value: 0})
+
+        # Initialise Dictionary
+        for u_value in unique_values:
+            attrib_val_occ[u_value] = attrib_data.count(u_value)
+
+        return attrib_val_occ
+
+    def calc_entropy(self, attrib_uv_count, overall):
+        entropy = 0
+        # print("UV: ", attrib_uv_count)
+
+        for key in attrib_uv_count.keys():
+
+            # if there is some occurrence of the value calculate entropy,
+            # otherwise ignore it (when there is 0 occurrences of the value)
+            if attrib_uv_count[key] != 0:
+                fraction = attrib_uv_count[key] / overall
+                target_attrib_calc = fraction * math.log2(fraction)
+
+                entropy += target_attrib_calc
+
+        return abs(entropy)
+
+    def calc_attrib_entropy(self, attrib_occurrences):
+        entropy_list = {}
+
+        for attrib_val_key in attrib_occurrences.keys():
+            attrib_val = attrib_occurrences[attrib_val_key]
+            overall = 0
+            for target_values in attrib_val.values():
+                overall += target_values
+
+            print("CALC TARGET ENTROPY FOR EACH ATTRIB OUTCOME: ", attrib_val)
+            attrib_entropy = self.calc_entropy(attrib_val, overall)
+            entropy_list.update({attrib_val_key: attrib_entropy})
+
+        print("Entropy list: ", entropy_list)
+
+        return entropy_list
+
+    # WEIGHTED AVERAGE ENTROPY for the children
+    def calc_entropy_weigh_avg(self, target_val_dist_attrib, overall, attrib_entropy):
+        weighted_entropy_avg = 0
+        for key in target_val_dist_attrib.keys():
+            curr_value = 0
+
+            for value in target_val_dist_attrib[key].values():
+                curr_value += value
+            weighted_entropy_avg += curr_value / overall * attrib_entropy[key]
+            # overall += curr_value
+
+        return weighted_entropy_avg
+
+    def calc_info_gain(self, target_entropy, target_dist_for_attrib):
+
+        # CALCULATE ENTROPY OF Attribute
+        attrib_entropy = self.calc_attrib_entropy(target_dist_for_attrib)
+        # print("Attrib Entropy: ", attrib_entropy)
+
+        weighted_avg_e = self.calc_entropy_weigh_avg(target_dist_for_attrib, TRAIN_DATA_SIZE, attrib_entropy)
+        # print("Attrib Weighted AVG: ", weighted_avg_e)
+
+        attrib_info_gain = target_entropy - weighted_avg_e
+
+        return attrib_info_gain
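+
+    # A hand-worked check (added for clarity, not part of the original file): for the "outlook" attribute,
+    # Sunny has 2 Yes / 3 No (entropy ≈ 0.971), Overcast has 4 Yes / 0 No (entropy 0) and Rain has
+    # 3 Yes / 2 No (entropy ≈ 0.971), so the weighted average is (5/14)*0.971 + (4/14)*0 + (5/14)*0.971 ≈ 0.694
+    # and the information gain is 0.940 - 0.694 ≈ 0.246, matching the G(train_data, O) = 0.246 figure
+    # quoted near the bottom of this file.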
+
+    # IMPORTANT NOTE: An attribute node should always be made together with its outcomes, never an outcome alone
+    # as it is not how this function was setup.
+    # :param (str) name - should always be the name of an attribute.
+    def build_node(self, name, completed_branches):
+        attrib_node = Node(name)
+        derived_nodes = []
+
+        completed_outcomes = []
+        for branch in completed_branches:
+            completed_outcomes.append(list(branch.keys())[0])
+
+        # if all outcome branches for this attribute are completed, then the attribute is complete and its outcomes
+        # should be popped off the active_branch_nodes list
+        # print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> CHECK COMPLETE ATTRIB: ", completed_branches)
+
+        # print(self.dataset_occurrence_dict[name].keys())
+        for outcome_name in self.dataset_occurrence_dict[name]:
+            new_outcome_node = Node(outcome_name)
+            # print("STATUS: NEW OUTCOME NODE CREATED")
+
+            # Check if the branch for the current outcome is complete (Target answer is 100% certain).
+            for branch in completed_branches:
+                if outcome_name in branch:
+                    # print("FOUND OUTCOME <<", outcome_name, ">> in ", branch)
+
+                    if len(new_outcome_node.derived_nodes) == 0:
+                        # Formally end the node
+                        endpoint_node = Node(branch[outcome_name], None)
+                        new_outcome_node.derived_nodes.append(endpoint_node)
+                        # print("STATUS: NEW OUTCOME ENDPOINT NODE CREATED & LINKED")
+
+            # The temp_outcome node is created so that the outcome node stored in the tree and the outcome node stored
+            # in the active_branch_nodes list are the same object. This is important because I never append directly
+            # onto the tree but to a reference of the active branch of the tree. This allows appending at any depth of
+            # the tree without needing to do any traversal to find the next available node.
+            temp_outcome = copy.deepcopy(new_outcome_node)
+            derived_nodes.append(temp_outcome)
+
+            # If the branch is still active/available to add to
+            if outcome_name not in completed_outcomes:
+                # Add the new node to the active branch list
+                self.active_branch_nodes.append(temp_outcome)
+            """print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Completed Nodes:", acc_completed)
+        acc_completed[name]["completed"] = True
+        all_outcomes_list = list(self.dataset_occurrence_dict[name].keys())
+
+        for outcome in all_outcomes_list:
+                if outcome in acc_completed[name]["outcomes"]:
+                    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ", outcome, " TRUE")
+                else:
+                    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>> ", outcome, " FALSE")
+                    acc_completed[name]["completed"] = False
+
+            print(all_outcomes_list)"""
+
+            new_outcome_node.derived_nodes.clear()
+
+        # print("STATUS: NEW NODE CREATED")
+        attrib_node.derived_nodes = derived_nodes
+        return attrib_node
+
+    # IMPORTANT NOTE: active_branch_nodes is only updated when the build_node function is called, therefore
+    # the link will not be appropriate unless the node was created through the build_node function.
+    def link_node(self, new_node):
+        """
+        print("  <<< CHECKING IF THE TREE SEGMENT IS BUILT RIGHT! >>>    ")
+        # TEMP
+        print("ATTRIBUTE/PARENT NODE: ", new_node.node_name)
+        print("DERIVED NODES LIST: ", new_node.derived_nodes)
+
+        print("FOR EACH NODE IN DERIVED NODES.")
+        for node in new_node.derived_nodes:
+            print("\t OUTCOME NODE FOR ATTRIB: ", node.node_name)
+            for other in node.derived_nodes:
+                print("\t\t TARGET OUTCOME REACHED: ", other.node_name)"""
+        if self.root_node is None:
+            self.root_node = new_node
+
+        else:
+            # Add the new node to the tree
+            # I hard coded 0 as the active node index because index 0 is always the next available node to link to.
+            self.active_branch_nodes[0].derived_nodes.append(new_node)
+
+            # Update the available nodes!
+            # The node at index 0 is already taken so that node should be popped off
+            self.active_branch_nodes.pop(0)
+
+    # Builds a part of the tree (attribute node with setup derived nodes/outcome nodes) and links it to the tree.
+    def build_tree_chunk(self, dataset_by_attrib_dict, target_attrib_list):
+        self.generate_occurrences(dataset_by_attrib_dict, target_attrib_list)
+        # print("Main DICTIONARY", self.dataset_occurrence_dict)
+
+        # TARGET ATTRIBUTE CALCULATIONS - Required for the calculation of info_gain for the rest of the attributes.
+        target_uv_data = list(set(target_attrib_list))  # TODO: POSSIBLE EFFICIENCY DECREASE
+        target_uv_count = self.count_value_occ(target_uv_data, target_attrib_list)
+        # print("Target Unique Value Count: ", target_uv_count)
+
+        target_entropy = self.calc_entropy(target_uv_count, TRAIN_DATA_SIZE)
+        # print("TARGET ENTROPY: ", target_entropy)
+
+        # Build each node (calculate its entropy and info_gain, and assign each attribute's outcomes as children),
+        # store the node in the node list and sort the nodes by info_gain to build the tree with them.
+        next_node_data = {"name": None, "info gain": 0, "completed": None}
+
+        for attrib_name in self.dataset_occurrence_dict.keys():
+            print("\n", "-" * 50)
+
+            # ATTRIB CALCULATIONS
+            print("attrib_name: ", attrib_name)
+
+            # Contains a data structure representing the target attribute's value distribution
+            # with regard to another attribute
+            target_dist_for_attrib = self.dataset_occurrence_dict[attrib_name]
+            # print("Target occurrences: ", target_dist_for_attrib)
+
+            # Check if any of the branches is completed
+            completed_branches = self.track_target_outcomes(target_dist_for_attrib)
+            print("COMPLETED BRANCHES: ", completed_branches)
+
+            attrib_info_gain = self.calc_info_gain(target_entropy, target_dist_for_attrib)
+            # print("The INFO GAIN for <<", attrib_name, ">> is ", attrib_info_gain)
+
+            if next_node_data["info gain"] < attrib_info_gain:
+                next_node_data["name"] = attrib_name
+                next_node_data["info gain"] = attrib_info_gain
+                next_node_data["completed"] = completed_branches
+
+        print("------> The next new node is: ", next_node_data["name"], "\n\n")
+        new_node = self.build_node(next_node_data["name"], next_node_data["completed"])
+        self.link_node(new_node)
+
+    # endregion
+
+    def build_tree(self, dataset_by_attrib_dict, target_attrib_list):
+
+        self.build_tree_chunk(dataset_by_attrib_dict, target_attrib_list)
+        print("\n\n")
+
+        while len(self.active_branch_nodes) != 0:
+            print(">>>>>>>>>>>>>>>>>>> Current active node: ", self.active_branch_nodes[0].node_name)
+            # self.linked_attrib_names
+            sub_attrib_dict, sub_tar_list = self.get_next_branch_occurrences(dataset_by_attrib_dict, target_attrib_list)
+            self.build_tree_chunk(sub_attrib_dict, sub_tar_list)
+            print("\n\n>>>>>>>>>>>>>>>>>>> List of active nodes: ", self.active_branch_nodes)
+
+        print("\n\n", "<"*5, "THE TREE IS COMPLETE!", ">"*5, "\n\n")
+
+    def visualise_tree(self):
+        # Recursive traversal as described by the original TODOs: branch_track starts as "" and grows
+        # by one "\t" per level so the print shows the tree hierarchy; base case -> no derived nodes.
+        def print_subtree(node, branch_track=""):
+            if node is None:
+                return
+            print(branch_track + node.node_name)
+            for derived in (node.derived_nodes or []):
+                print_subtree(derived, branch_track + "\t")
+
+        print_subtree(self.root_node)
+
+    # This function runs classification on one entry and returns the answer.
+    # Should only be called after the tree model was built.
+    def classify(self, entry_index, dataset_by_attrib_dict):
+        answer = None
+
+        # TODO: assert that root node is not none
+        current_node = self.root_node
+
+        while current_node.derived_nodes is not None:
+            print("\n  <<< TRAVERSING TREE >>>  ")
+            print("Current Attrib: ", current_node.node_name)
+
+            # Ask the tree which attribute/column to look for first
+            column_name = current_node.node_name
+
+            # Fetch the value for the given entry (entry_index) from the column identified by the tree.
+            current_outcome_name = dataset_by_attrib_dict[column_name][entry_index]
+            print("\tCurrent outcome name: ", current_outcome_name)
+
+            # Get that node from the derived nodes list
+            for outcome_node in current_node.derived_nodes:
+                if outcome_node.node_name == current_outcome_name:
+                    # print("\n  <<< TRAVERSING TREE >>>  ")
+                    # print("FOUND VALUE FOR ENTRY <<", entry_index, ">>  ->  <<", outcome_node.node_name, ">>")
+                    current_node = outcome_node.derived_nodes[0]
+                    # print("Current Attrib: ", current_node.node_name)
+                    answer = current_node.node_name
+
+        print("    <<< FOUND VALUE >>>  ")
+        print("    The answer is: ", answer)
+
+        return answer
+
+
+def test_run_algorithm():
+    print(" "*10, " << ID3 CLASSIFICATION ALGORITHM >> ", " "*10)
+
+    tree = ID3DecisionTree()
+    tree.build_tree(DATASET_BY_ATTRIB_DICT, TARGET_ATTRIB_LIST)
+
+    # APPLY CLASSIFICATION
+    # The index of the entry in the dataset.
+    entry_index = 0
+    tree.classify(entry_index, DATASET_BY_ATTRIB_DICT)
+
+
+test_run_algorithm()
+
+"""
+# Remove the completed branches
+for branch in completed_branches:
+    for key in branch.keys():
+        target_val_dist_for_grade.pop(key)
+
+print("After removing completed branches: ", target_val_dist_for_grade)
+"""
+
+# region Build Decision Tree
+
+# endregion
+
+""" 
+What is "Training Data"? 
+    Building the tree is done with training data, which already has the answer to whatever question is being asked. 
+    The example given on the slides, which asks whether someone can buy a laptop, is training data
+    because the answer is already known for every entry.
+"""
+"""
+Apply information gain function to each attribute calculate_gain(attr_out)
+Should that be applied to the target as well? No
+Example:
+    - G(train_data, O) = 0.246
+    - G(train_data, H) = 0.151
+    - G(train_data, W) = 0.048
+
+Once the root node is known, look at how many unique values are there.
+If there are 4 possible values and they are not numbers, 
+for example "Sunny", "Rainy", etc. there should be 4 nodes. 
+"""
+
+# region Apply Classification
+"""
+What is "Test Data"?
+    Test data is a new entry that we want to classify. 
+    For example: a bank may use an already trained ID3 model to check whether you should get a credit card or not.
+    It will have different attributes like - number of times you have gone bankrupt; what is your current net worth; 
+    are you a student; what is your credit score; etc.
+    The target attribute will then be EligibleForCreditCard(True or False)
+"""
+
+# Use the built decision tree to look through a row of data from the data set. This is done using test data.
+# (How to evaluate if the classification has an error?)
+""" 
+Steps: 
+    1. Find which is the current attribute to look through (to start with, ask the tree which attribute is the root node)
+        1.1 (When building the tree, make sure the attributes have the exact same name as the Node data)
+        1.2 Search through all possible attributes
+        1.3 Check if the attribute name == the node name
+        
+    2. Find the attribute value for the current row
+        2.1 Ask the data set which value is given for this attribute
+        2.2 Find which of the children nodes in the tree is equivalent to the given value
+        
+    Repeat these steps recursively until an answer is found. (A hedged usage sketch follows after this region.) 
+"""
+# endregion
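+
+
+# A hedged usage sketch (an assumption, not part of the original workflow): it shows how classify()
+# could be applied to a brand-new observation by appending it to a copy of the training columns.
+# The function is only defined here; it is not called anywhere.
+def classify_new_entry_example():
+    new_entry = {"outlook": "Sunny", "temperature": "Cool", "humidity": "High", "wind": "Strong"}
+    extended_dict = {key: values + [new_entry[key]] for key, values in DATASET_BY_ATTRIB_DICT.items()}
+    example_tree = ID3DecisionTree()
+    example_tree.build_tree(DATASET_BY_ATTRIB_DICT, TARGET_ATTRIB_LIST)
+    # The new entry sits at index len(TARGET_ATTRIB_LIST) of every extended column.
+    return example_tree.classify(len(TARGET_ATTRIB_LIST), extended_dict)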

+ 336 - 0
BI/Practica2.py

@@ -0,0 +1,336 @@
+# -*- coding: utf-8 -*-
+
+"""
+Author:
+    Francisco Solano López Rodríguez
+Date:
+    November 2018
+Contents:
+    Practical 2 - Clustering
+    Business Intelligence (Inteligencia de Negocio)
+    Computer Engineering Degree (Grado en Ingeniería Informática)
+    Universidad de Granada
+"""
+
+import time
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import numpy as np
+
+from sklearn.cluster import KMeans, AgglomerativeClustering, estimate_bandwidth
+from sklearn.cluster import Birch, SpectralClustering, MeanShift, DBSCAN, MiniBatchKMeans
+from sklearn import metrics
+from sklearn import preprocessing
+from math import floor
+import seaborn as sns
+from scipy.cluster.hierarchy import dendrogram,ward
+
+seed = 12345
+
+################### FUNCTIONS ###########################
+
+def getPrediction(algorithm, X):
+    t = time.time()
+    cluster_predict = algorithm.fit_predict(X) 
+    tiempo = time.time() - t
+
+    return cluster_predict, tiempo
+
+# Function to get the mean values of each cluster
+def getMeans(dataFrame):
+    return dataFrame.groupby("cluster").mean()
+
+# Function to get the standard deviations of each cluster
+def getStd(dataFrame):
+    return dataFrame.groupby("cluster").std()
+
+# Function to plot the scatter matrix
+def DrawScatterMatrix(data, name=None, display=True, save=False):
+    sns.set()
+    variables = list(data)
+    variables.remove('cluster')
+    sns_plot = sns.pairplot(data, vars=variables, hue="cluster", palette='Paired', plot_kws={"s": 25},
+                            diag_kind="hist") 
+    sns_plot.fig.subplots_adjust(wspace=.03, hspace=.03)
+
+    if name != None:        
+        plt.title("scatter_"+name)
+
+    # Show the image on screen
+    if display:
+        plt.show()
+
+    # Save the image to disk
+    if save:        
+        if name == None:
+            name = "_unknown_"
+        image_name = "scatter/scatter_" + name + ".png"
+        plt.savefig(image_name)
+        plt.clf()
+        print("Imagen guardada: ", image_name)
+
+# Function to plot the heatmap
+def DrawHeatmap(data, name = None, display=True, save = False):
+    data_normal = data.apply(norm_to_zero_one)
+    meanDF = getMeans(dataFrame = data_normal)
+    hm = sns.heatmap(data=meanDF, linewidths=.1, cmap="Blues", annot=True, xticklabels='auto')
+    plt.xticks(rotation=0)
+    plt.title("heatmap_"+name)
+
+    if name != None:        
+        plt.title("heatmap_"+name)
+
+    # Show the image on screen
+    if display:
+        plt.show()
+
+    # Save the image to disk
+    if save:
+        if name == None:
+            name = "_unknown_"
+        image_name = "heatmap/heatmap_" + name + ".png"
+        plt.savefig(image_name)
+        plt.clf()
+        print("Imagen guardada: ", image_name)
+
+# Function to plot the dendrogram
+def DrawDendrogram(data, name = None, display=True, save = False):
+    data_normal = preprocessing.normalize(data, norm='l2')
+    linkage_array = ward(data_normal)
+
+    dendrogram(linkage_array,leaf_rotation=90., leaf_font_size=5.)
+    
+    if name != None:        
+        plt.title("dendograma_" + name)
+
+    # Show the image on screen
+    if display:
+        plt.show()
+
+    # Save the image to disk
+    if save:
+        if name == None:
+            name = "_unknown_"
+        image_name = "dendrogram/dendrogram_" + name + ".png"
+        plt.savefig(image_name)
+        plt.clf()
+        print("Imagen guardada: ", image_name)
+
+def dataFrameResultados(algoritmos, num_cluster, metrics_CH, metrics_SC, tiempos):
+    df_algo = pd.DataFrame(algoritmos, columns=['Algoritmo'])
+    df_nc = pd.DataFrame(num_cluster, columns=['Num. Clusters'])
+    df_CH = pd.DataFrame(metrics_CH, columns=['CH'])
+    df_SC = pd.DataFrame(metrics_SC, columns=['SH'])
+    df_t = pd.DataFrame(tiempos, columns=['Tiempo'])
+
+    resultados = pd.concat([df_algo, df_nc, df_CH, df_SC, df_t], axis=1)
+
+    return resultados
+
+def norm_to_zero_one(df):
+    return (df - df.min()) * 1.0 / (df.max() - df.min())
+
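+# For reference (added for clarity): norm_to_zero_one rescales each column linearly so its minimum maps
+# to 0 and its maximum to 1, i.e. (x - min) / (max - min); a column with values [2, 4, 6] becomes [0.0, 0.5, 1.0].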
+
+def executeClustering(algorithms, X, caso):
+
+    f = open("caso_" + str(caso) + ".txt", 'w')
+
+    X_normal = X.apply(norm_to_zero_one)
+
+    names = []
+    num_cluster = []
+    metrics_CH = []
+    metrics_SC = []
+    tiempos = []
+
+    print("\nCaso de estudio ", caso, ", tamaño: ", len(X))
+    f.write("\nCaso de estudio " + str(caso) + ", tamaño: " + str(len(X)))
+
+    for algorithm, name_algorithm in algorithms:
+
+        print("\n----------------------------------------\n")
+        print("Ejecutando algoritmo: ", name_algorithm, "\n")
+        f.write("\n--------------------------------------\n")
+        f.write("Ejecutando algoritmo: " + name_algorithm + "\n")        
+        # Run the clustering algorithm
+        cluster_predict, tiempo = getPrediction(algorithm, X_normal)
+
+        # Put the predictions into a DataFrame
+        clusters = pd.DataFrame(cluster_predict,index=X.index,columns=['cluster'])
+
+        print("Tamaño de cada cluster:")
+        f.write("\nTamaño de cada cluster:\n")
+        size=clusters['cluster'].value_counts()
+
+        for num,i in size.iteritems():
+           print('%s: %5d (%5.2f%%)' % (num,i,100*i/len(clusters)))
+           f.write('%s: %5d (%5.2f%%)\n' % (num,i,100*i/len(clusters)))
+        print()
+
+        # Get the metric results
+        metric_CH = metrics.calinski_harabaz_score(X_normal, cluster_predict)
+        metric_SC = metrics.silhouette_score(X_normal, cluster_predict, metric='euclidean', 
+                                         sample_size=floor(0.2*len(X)), random_state=seed)
+
+        # Store the algorithm name, the number of clusters, 
+        # the times and the metrics for later comparison 
+        names.append(name_algorithm)   
+        num_cluster.append(len(set(cluster_predict)))
+        metrics_CH.append(metric_CH)
+        metrics_SC.append(metric_SC)
+        tiempos.append(tiempo)
+
+        # Add the cluster assignment to X as a column
+        X_cluster = pd.concat([X, clusters], axis=1)
+        X_normal_cluster = pd.concat([X_normal, clusters], axis=1)
+
+        name = "caso_" + str(caso) + "_" + name_algorithm  
+
+        # Plot the scatter matrix
+        DrawScatterMatrix(data = X_cluster, name = name, display = False, save = True)
+
+        # Plot the heatmap
+        DrawHeatmap(data = X_cluster, name = name, display = False, save = True)
+
+        # DataFrame with the mean of each feature in each cluster
+        meanDF = getMeans(dataFrame = X_cluster)
+        print()
+        print(meanDF)
+        f.write(meanDF.to_string())
+
+        # If the algorithm is AgglomerativeClustering, plot the dendrogram
+        if name_algorithm == 'AC':
+            DrawDendrogram(data = X_cluster, name = name, display = False, save = True)
+
+
+    resultados = dataFrameResultados(names, num_cluster, metrics_CH, metrics_SC, tiempos)
+
+    print("\n**************************************\n")
+    print(resultados.to_string())
+    print("\n**************************************\n")
+
+    f.write("\n**************************************\n")
+    f.write(resultados.to_string())
+    f.write("\n**************************************\n")
+
+    f.close()
+
+
+#########################################################
+
+# Read in the data
+
+print("Leyendo el conjunto de datos...")
+censo = pd.read_csv('censo_granada.csv')
+censo = censo.replace(np.NaN,0) 
+print("Lectura completada.")
+
+
+###### CASE STUDIES ######
+
+#-------- CASE 1 --------
+
+casado = 2
+hombre = 1
+mujer = 6
+
+subset = censo.loc[(censo['EDAD']>=20) & (censo['EDAD']<=50) & (censo['SEXO']==mujer)]
+usadas = ['EDAD', 'NPFAM', 'HM5', 'H0515']
+X = subset[usadas]
+X_normal = preprocessing.normalize(X, norm='l2')
+
+#-------- CASE 2 --------
+
+subset_2 = censo.loc[(censo['EDAD']>=20) & (censo['EDAD']<=50) & (censo['SEXO']==hombre)]
+usadas_2 = ['EDAD', 'NPFAM', 'HM5', 'H0515']
+X_2 = subset_2[usadas_2]
+X_normal_2 = X_2.apply(norm_to_zero_one)
+
+#-------- CASE 3 --------
+
+subset_3 = censo.loc[(censo['EDAD']>=20) & (censo['EDAD']<=50) & (censo['SEXO']==mujer)]
+usadas_3 = ['EDAD', 'NPFAM', 'NHIJOS', 'ESREAL']
+X_3 = subset_3[usadas_3]
+X_normal_3 = X_3.apply(norm_to_zero_one)
+
+###############################
+
+# Get the correlation between the variables
+'''
+correlation = X.corr()
+sns.heatmap(correlation, square = True)
+plt.show()
+'''
+
+#################### Algorithms #####################
+
+random_seed = 123
+
+k_means = KMeans(init='k-means++', n_clusters=5, n_init=5, random_state=random_seed)
+
+agglo=AgglomerativeClustering(n_clusters=5,linkage="ward")
+
+meanshift = MeanShift(bin_seeding=True)
+
+miniBatchKMeans = MiniBatchKMeans(init='k-means++',n_clusters=4, n_init=5, max_no_improvement=10, verbose=0, random_state=random_seed)
+
+dbscan = DBSCAN(eps=0.2)
+
+dbscan2 = DBSCAN(eps=0.1)
+
+algorithms = [(k_means, "KMeans"),
+              (agglo, "AC"),
+              (meanshift, "MeanShift"), 
+              (miniBatchKMeans, "MiniBatchKM"),
+              (dbscan, "DBSCAN")]
+
+algorithms2 = [(k_means, "KMeans"),
+              (agglo, "AC"),
+              (meanshift, "MeanShift"), 
+              (miniBatchKMeans, "MiniBatchKM"),
+              (dbscan2, "DBSCAN2")]
+
+
+# KMeans with different numbers of clusters
+
+algorithm_kmeans = []
+
+for i in range(5,9):
+    kmeans_i = KMeans(init='k-means++', n_clusters=i, n_init=5)
+    algorithm_kmeans.append((kmeans_i, "KMeans_" + str(i)))
+
+# AgglomerativeClustering with different numbers of clusters
+
+algorithm_AC = []
+
+for i in range(5,9):
+    agglo_i = AgglomerativeClustering(n_clusters=i,linkage="ward")
+    algorithm_AC.append((agglo_i, "AC_" + str(i)))
+
+# MiniBatchKMeans with different numbers of clusters
+
+algorithm_miniBatch = []
+
+for i in range(5,9):
+    miniBatch_i = MiniBatchKMeans(init='k-means++',n_clusters=i, n_init=5, max_no_improvement=10, verbose=0, random_state=random_seed)
+    algorithm_miniBatch.append((miniBatch_i, "MiniBatchKM_" + str(i)))
+
+#-----------------------------------------------------#
+
+# RUN CASE 1
+executeClustering(algorithms, X, 1)
+executeClustering(algorithm_kmeans, X, 1.1)
+executeClustering(algorithm_AC, X, 1.2)
+
+# RUN CASE 2
+executeClustering(algorithms, X_2, 2)
+executeClustering(algorithm_kmeans, X_2, 2.1)
+executeClustering(algorithm_miniBatch, X_2, 2.2)
+
+# RUN CASE 3
+executeClustering(algorithms2, X_3, 3)
+executeClustering(algorithm_kmeans, X_3, 3.1)
+executeClustering(algorithm_miniBatch, X_3, 3.2)
+

+ 132 - 0
BI/apriori.py

@@ -0,0 +1,132 @@
+# author: Justin Cui
+# date: 2019/10/23
+# email: 321923502@qq.com
+
+
+from numpy import *
+
+
+def load_data():
+    dataSet = [['bread', 'milk', 'vegetable', 'fruit', 'eggs'],
+               ['noodle', 'beef', 'pork', 'water', 'socks', 'gloves', 'shoes', 'rice'],
+               ['socks', 'gloves'],
+               ['bread', 'milk', 'shoes', 'socks', 'eggs'],
+               ['socks', 'shoes', 'sweater', 'cap', 'milk', 'vegetable', 'gloves'],
+               ['eggs', 'bread', 'milk', 'fish', 'crab', 'shrimp', 'rice']]
+    return dataSet
+
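+# A hand-worked example (added for clarity): 'milk' appears in 4 of the 6 transactions above, so
+# support({'milk'}) = 4/6 ≈ 0.67; with the min_support of 0.5 used in __main__, {'milk'} therefore
+# survives into the frequent 1-itemsets L1.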
+
+# Scan all the data and generate the candidate 1-itemsets C1
+def create_c1(data):
+    c1 = []
+    for transaction in data:
+        for item in transaction:
+            if [item] not in c1:
+                c1.append([item])
+    c1.sort()
+    return list(map(frozenset, c1))
+
+
+# Generate the corresponding L(i) from C(i)
+def c2l(data, ck, min_support):
+    dict_sup = {}
+    for i in data:
+        for j in ck:
+            if j.issubset(i):
+                if j not in dict_sup:
+                    dict_sup[j] = 1
+                else:
+                    dict_sup[j] += 1
+    support_data = {}
+    result_list = []
+    for i in dict_sup:
+        temp_sup = dict_sup[i] / len(data)
+        if temp_sup >= min_support:
+            result_list.append(i)
+            support_data[i] = temp_sup
+    return result_list, support_data
+
+
+# Generate C(k) from L(k-1)
+def get_next_c(Lk, k):
+    result_list = []
+    len_lk = len(Lk)
+    for i in range(len_lk):
+        for j in range(i + 1, len_lk):
+            l1 = list(Lk[i])[:k]
+            l2 = list(Lk[j])[:k]
+            if l1 == l2:
+                a = Lk[i] | Lk[j]
+                a1 = list(a)
+                b = []
+                for q in range(len(a1)):
+                    t = [a1[q]]
+                    tt = frozenset(set(a1) - set(t))
+                    b.append(tt)
+                t = 0
+                for w in b:
+                    if w in Lk:
+                        t += 1
+                if t == len(b):
+                    result_list.append(b[0] | b[1])
+    return result_list
+
+
+# Get all the L (frequent itemset) collections
+def get_all_l(data_set, min_support):
+    c1 = create_c1(data_set)
+    data = list(map(set, data_set))
+    L1, support_data = c2l(data, c1, min_support)
+    L = [L1]
+    k = 2
+    while (len(L[k - 2]) > 0):
+        Ck = get_next_c(L[k - 2], k - 2)
+        Lk, sup = c2l(data, Ck, min_support)
+        support_data.update(sup)
+        L.append(Lk)
+        k += 1
+    del L[-1]
+    return L, support_data
+
+
+# Get all subsets of the L sets
+def get_subset(from_list, result_list):
+    for i in range(len(from_list)):
+        t = [from_list[i]]
+        tt = frozenset(set(from_list) - set(t))
+        if tt not in result_list:
+            result_list.append(tt)
+            tt = list(tt)
+            if len(tt) > 1:
+                get_subset(tt, result_list)
+
+
+# Compute the confidence
+def calc_conf(freqSet, H, supportData, min_conf):
+    for conseq in H:
+        conf = supportData[freqSet] / supportData[freqSet - conseq]
+        lift = supportData[freqSet] / (supportData[conseq] * supportData[freqSet - conseq])
+        if conf >= min_conf and lift > 1:
+            print(set(freqSet - conseq), '-->', set(conseq), '支持度', round(supportData[freqSet - conseq], 2), '置信度:',
+                  conf)
+
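+# For reference (added for clarity): calc_conf above uses
+#   confidence(X -> Y) = support(X ∪ Y) / support(X)
+#   lift(X -> Y)       = support(X ∪ Y) / (support(X) * support(Y))
+# and only prints rules with confidence >= min_conf and lift > 1 (positively correlated itemsets).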
+
+# Generate the association rules
+def gen_rule(L, support_data, min_conf=0.7):
+    for i in range(len(L)):
+        print("\n", i + 1, "-频繁项集为:")
+        for freqSet in L[i]:
+            print(set(freqSet), end="  ")
+    print("\n")
+    for i in range(1, len(L)):
+        for freqSet in L[i]:
+            H1 = list(freqSet)
+            all_subset = []
+            get_subset(H1, all_subset)
+            calc_conf(freqSet, all_subset, support_data, min_conf)
+
+
+if __name__ == '__main__':
+    dataSet = load_data()
+    L, supportData = get_all_l(dataSet, 0.5)
+    gen_rule(L, supportData, 0.6)

+ 440 - 0
BI/bi_main.py

@@ -0,0 +1,440 @@
+"""
+This module uses an OOP approach to solve the BI Challenge
+"""
+from warnings import simplefilter
+simplefilter(action='ignore', category=FutureWarning)
+# %matplotlib inline
+from google.colab import files
+import pandas as pd
+import numpy as np
+# %reload_ext sql
+import sqlite3
+import seaborn as sns
+import matplotlib.pyplot as plt
+from plotly.offline import iplot
+import plotly.express as px
+
+pd.options.display.float_format = '{:.2f}'.format # suppress scientific notation
+# Declare your GitHub repository addresses
+A_url='https://raw.githubusercontent.com/haensel-ams/recruitment_challenge/master/BI_201805/table_A_conversions.csv'
+B_url='https://raw.githubusercontent.com/haensel-ams/recruitment_challenge/master/BI_201805/table_B_attribution.csv'
+
+# The Extract class is to extract data from your GitHub repository addresses
+class Extract():
+
+  def __init__(self,A_url,B_url):
+    print('\033[1m'+'Please, wait! I am extracting data from your Github Repository'+'\033[0m'+'\n...')
+    self.A_url=A_url
+    self.table_A_conversions=self.load_data(self.A_url)
+    self.B_url=B_url
+    self.table_B_attribution=self.load_data(self.B_url)
+    print('Data was successfully extracted!')
+  
+  def load_data(self,url):
+    self.data=pd.read_csv(url)
+    #display(self.data.head(3))
+    return self.data
+
+# The Transform class is to combine the two extracted datasets and do the data cleansing
+# Also, to report the general information about the KPIs
+class Transform():
+
+  def __init__(self,extract):
+    print('\033[1m'+'I am transforming the extracted data'+'\033[0m'+'\n...')
+    self.table_A_conversions=extract.table_A_conversions
+    self.table_B_attribution=extract.table_B_attribution
+    self.joined_tabs = self.combine_tab(self.table_A_conversions, self.table_B_attribution,'Conv_ID')
+    self.time_tab=self.cleaning_data(self.joined_tabs)
+    # self.infor_Data=self.get_infor(self.time_tab)
+    self.get_missing=self.check_missing(self.time_tab)
+    self.cleaned_tab=self.time_tab.dropna()
+    display(self.cleaned_tab.head(5))
+    self.infor_Data=self.get_infor(self.cleaned_tab)
+    self.more_infor=self.deep_infor(self.cleaned_tab)
+  
+  def deep_infor(self,data):
+    print('Total annual revenue: %d'%data['Revenue'].sum())
+    
+
+  def combine_tab(self,tab_1,tab_2,common_col):
+    print('I am combining the two datasets into one and converting the time format\n...')
+    self.data=pd.merge(tab_1, tab_2, on=common_col, how='outer')
+    # display(self.data.head(5))
+    return self.data
+
+  def cleaning_data(self,data):
+    data['Conv_Date']= pd.to_datetime(data['Conv_Date']) 
+    self.data=data
+    print('Data was completely transformed!')
+    return self.data
+
+  def get_infor(self,data):
+    print('\033[1m'+'General information:'+'\033[0m')
+    self.information=data.info()
+    print('\033[1m'+'Descriptive Statistics:'+'\033[0m')
+    # print(data.describe())
+    return self.information
+
+  def check_missing(self,data):
+    print('\033[1m'+ 'The number of missing values:'+'\033[0m')
+    self.miss_data=data.isnull().sum()
+    self.miss_rate=100*data.isnull().sum()/len(data)
+    self.mis_infor=pd.concat([self.miss_data, self.miss_rate], axis=1).reset_index()
+    self.mis_infor=self.mis_infor.rename(columns={0: 'Amounts', 1: 'Percentage'})
+    # print(self.mis_infor)
+    return self.miss_data
+
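+# NOTE (editorial assumption): the "%sql ..." commands assigned below are Jupyter/Colab magics kept as
+# plain strings; they only take effect when run as notebook magics, not when this file is executed as
+# an ordinary Python script.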
+# The Load class is to load the transformed data to the database
+class Load():
+
+  def __init__(self,transform):
+    print('\033[1m'+'I am loading the transformed data to my database'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab
+    self.connect=self.connect_database()
+    self.insert=self.insert_data(self.data)
+    
+  def connect_database(self):
+    print('I am trying to connect to my SQL database\n....')
+    self.connect= "%sql sqlite:///phuong_database.db"
+    print(self.connect,'connection is success!',sep='\n')
+    return self.connect
+
+  def insert_data(self,data):
+    print('I am loading the transformed data to my SQL Database\n....')
+    self.check ="%sql DROP TABLE IF EXISTS data"
+    self.insert="%sql PERSIST data"
+    self.list_table="%sql SELECT name FROM sqlite_master WHERE type='table'"
+    print(self.list_table)
+    self.data="%sql SELECT * FROM data LIMIT 3"
+    print(self.data)
+    print('Data was completely inserted into my SQL Database!')
+    return self.insert 
+
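+# A hedged usage sketch (an assumption based on the constructors above; the original driver cell is not
+# shown in this file): the ETL/EDA classes appear intended to be chained like this.
+# extract = Extract(A_url, B_url)
+# transform = Transform(extract)
+# load = Load(transform)
+# overview = EDA_Overview_KPI(transform)
+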
+# The EDA_Overview_KPI class is to generate a preliminary overview of the KPIs
+class EDA_Overview_KPI():
+
+  def __init__(self,transform):
+    print('\033[1m'+'I am doing the Exploratory Data Analysis (EDA) process for Revenue KPIs'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab[['Conv_Date','Revenue','User_ID']]
+    self.by_kpi=self.group_data(self.data,'Conv_Date','Revenue','User_ID')
+    # display(self.by_kpi.head(3))
+    self.kpi_fig=self.plot_kpi(self.by_kpi)
+    self.sum_stat=self.get_infor(self.by_kpi,'Conv_Date','Revenue','User_ID')
+      
+    
+  def group_data(self,data,target,exp_1,exp_2):
+    self.num_target=len(data[target].unique())
+    print('The number of '+target+': %d'%self.num_target)
+    self.data=data.groupby([target]).agg({exp_1:'sum',exp_2:'count'})
+    return self.data
+
+  def plot_kpi(self,data):
+    self.name_column=self.data.columns
+    plt.figure(figsize=(15, 9))
+    for i,col in enumerate(self.name_column):
+        plt.subplot(2,1,i+1)
+        plt.plot(self.data[col],label=col)
+        plt.title('The changes in the daily '+col +' over the time period',fontweight='bold',fontsize='12')
+        plt.legend()
+        plt.autoscale(enable=True, axis='both',tight=True)
+    plt.savefig('Overview_KPI.png')
+    files.download('Overview_KPI.png')
+    return self.name_column
+
+  def get_infor(self,data,target,exp_1,exp_2):
+    self.infor=display(self.data.head(8).T)
+    print('\033[1m'+'Descriptive Statistics of the Daily KPIs by '+ target +'\033[0m', self.data.describe(),sep='\n')
+    print('Date with the highest revenue:', self.data[exp_1].idxmax(axis = 0) )
+    print('Date with the lowest revenue:', self.data[exp_1].idxmin(axis = 0) )
+    print('Date with the highest number of users:', self.data[exp_2].idxmax(axis = 0) )
+    print('Date with the lowest number of users:', self.data[exp_2].idxmin(axis = 0) )
+    return self.infor
+
+# The EDA_KPI_Return class is to generate a preliminary overview of returning customers
+class EDA_KPI_Return():
+
+  def __init__(self,transform):
+    print('\033[1m'+'I am doing the Exploratory Data Analysis (EDA) process for User KPIs'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab[['Conv_Date','User_ID']]
+    self.infor_user=self.get_infor(self.data,'User_ID')
+    self.by_user=self.group_data(self.data,'User_ID','Conv_Date')
+    display(self.by_user.head(8).T)
+    self.user_plot=self.plot_user(self.by_user,'Conv_Date')
+
+  def get_infor(self,data,exp):
+    self.num_user=data[exp].unique()
+    print('The number of users: %d'%len(self.num_user))
+    return self.num_user
+
+  def group_data(self,data,target,exp):
+    self.num_target=len(data[target].unique())
+    print('The number of '+target+': %d'%self.num_target)
+    self.data=data.groupby([target]).agg({exp:'count'})
+    # display(self.data.head(8).T)
+    print('\033[1m'+'Descriptive Statistics of the Daily KPIs by '+ target +'\033[0m', self.data.describe(),sep='\n')
+    return self.data
+
+  def plot_user(self,data,exp):
+    self.data=data.rename(columns={exp: 'The number of returns'})
+    self.ax=self.data.plot.hist(figsize=(15, 9),bins=1500,xlim=(1,20),color='#86bf91'
+                                ,title='The Frequency of Returning Customers',grid=True)
+    self.ax.set_xlabel('The number of days')
+    plt.savefig('Customer_return.png')
+    files.download('Customer_return.png') 
+    return self.ax
+
+# The EDA_Static_Ren class is to explore the information about the total revenue per channel over the year
+class EDA_Static_Ren():
+  
+  def __init__(self,transform):
+    print('\033[1m'+'I am doing the EDA on Revenue by Channel'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab[['Channel','Revenue']]
+    display(self.data.head(3))
+    # self.infor_conver=self.get_infor(self.data)
+    self.by_ChanelRen=self.group_data(self.data,'Channel')
+    self.pie_ChanelRen=self.plot_pie(self.by_ChanelRen,'Revenue')
+
+  def plot_pie(self,data,target):
+    self.data=data
+    self.data['Total Conver'] = self.data.sum(axis=1)
+    self.data['Total Top Five'] = self.data[['A','G','H','I','B']].sum(axis=1)
+    self.data['The Rest'] = self.data['Total Conver']-self.data['Total Top Five']
+    self.ax=self.data[['A','G','H','I','B','The Rest']].T.plot.pie(y=target,figsize=(12, 7),autopct='%1.1f%%',)
+    plt.savefig('channel_Static_Ren.jpg')
+    files.download('channel_Static_Ren.jpg') 
+    return self.data
+    
+
+  def get_infor(self,data):
+    self.conver_uni=self.data.User_ID.unique()
+    print('The number of conversions: %d'%len(self.conver_uni))
+    return self.conver_uni
+
+  def group_data(self,data,target):
+    print('I am grouping data by '+ target + '\n...')
+    self.data=data.groupby([target]).agg({'Revenue':'sum'})
+    self.data=self.data.T
+    display(self.data)
+    print('I am done! ')
+    return self.data    
+
+# The EDA_Static_User class is to generate information about the total number of visits per channel over the year
+class EDA_Static_User():
+  
+  def __init__(self,transform):
+    print('\033[1m'+'I am doing the EDA on Users by Channel'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab[['Channel','User_ID']] #'Conv_Date',
+    display(self.data.head(3))
+    # self.infor_conver=self.get_infor(self.data)
+    self.by_ChanelConv=self.group_data(self.data,'Channel')
+    self.pie_channelConv=self.plot_pie(self.by_ChanelConv,'User_ID')
+
+  def plot_pie(self,data,target):
+    self.data=data
+    self.data['Total Conver'] = self.data.sum(axis=1)
+    self.data['Total Top Five'] = self.data[['A','G','H','I','B']].sum(axis=1)
+    self.data['The Rest'] = self.data['Total Conver']-self.data['Total Top Five']
+    self.ax=self.data[['A','G','H','I','B','The Rest']].T.plot.pie(y=target,figsize=(12, 7),autopct='%1.1f%%',)
+    plt.savefig('channel_Static_User.jpg')
+    files.download('channel_Static_User.jpg') 
+    return self.data
+    
+
+  def get_infor(self,data):
+    self.conver_uni=self.data.User_ID.unique()
+    print('The number of conversions: %d'%len(self.conver_uni))
+    return self.conver_uni
+
+  def group_data(self,data,target):
+    print('I am grouping data by '+ target + '\n...')
+    self.data=data.groupby([target]).agg({'User_ID':'count'})
+    self.data=self.data.T
+    display(self.data)
+    print('I am done! ')
+    return self.data    
+
+# The EDA_Static_Conversion class is to generate the information about the total annual number of conversions
+class EDA_Static_Conversion():
+  
+  def __init__(self,transform):
+    print('\033[1m'+'I am  doing the EDA on Conversion'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab[['Channel','Conv_ID','IHC_Conv']] #'Conv_Date',
+    display(self.data.head(3))
+    self.infor_conver=self.get_infor(self.data)
+    self.by_ChanelConv=self.group_data(self.data,'Channel','Conv_ID')
+    self.pie_channelConv=self.plot_pie(self.by_ChanelConv,'Conv_ID')
+
+
+  def get_infor(self,data):
+    self.conver_uni=self.data.Conv_ID.unique()
+    print('The number of conversions: %d'%len(self.conver_uni))
+    return self.conver_uni
+
+  def group_data(self,data,target,exp):
+    print('I am grouping data by '+ target + '\n...')
+    if data[exp].dtype=='object':
+      self.data=data.groupby([target]).agg({exp:'count'})
+    else:
+      self.data=data.groupby([target]).agg({exp:'sum'})
+    self.data=self.data.T
+    display(self.data)
+    print('I am done! ')
+    return self.data    
+
+  def plot_pie(self,data,target):
+    self.data=data
+    self.data['Total Conver'] = self.data.sum(axis=1)
+    self.data['Total Top Five'] = self.data[['A','G','H','I','B']].sum(axis=1)
+    self.data['The Rest'] = self.data['Total Conver']-self.data['Total Top Five']
+    self.ax=self.data[['A','G','H','I','B','The Rest']].T.plot.pie(y=target,figsize=(12, 7),autopct='%1.1f%%',)
+    plt.savefig('channel_Conver.png')
+    files.download('channel_Conver.png') 
+    return self.data
+
+# The EDA_Channel_Revenue class analyzes the impact of the online marketing channels on
+# the daily revenue
+class EDA_Channel_Revenue():
+
+  def __init__(self,transform):
+    print('\033[1m'+'I am analyzing the influences of the online marketing channels on the daily revenue'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab[['Conv_Date','Channel','Revenue']]
+    self.by_DateChannel=self.group_data(self.data,'Conv_Date','Channel')
+    self.unstaked_data=self.unstack_data(self.by_DateChannel,'Revenue','bar')
+    self.plotted_data=self.plot_data(self.unstaked_data)   
+    self.exported_data=self.export_data(self.unstaked_data,'channel_revenue')
+
+  def group_data(self,data,target_1,target_2):
+    print('I am grouping data by '+ target_1 +' and '+ target_2 + '\n...')
+    self.data=data.groupby([target_1,target_2])#.agg({exp:'count'})
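+    # No aggregation here: unstack_data() below reduces this GroupBy with .sum() before pivoting.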
+    print('I am done! ')
+    return self.data    
+
+  def unstack_data(self,data,exp,kind):
+    print('I am unstacking data \n...')
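+    # Sum each (Conv_Date, Channel) group and pivot the Channel level into columns.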
+    data=data.sum()[exp].unstack(level=-1)
+    self.data=data
+    display(self.data.head(3))
+    print('Data were unstacked completely\n...')
+    return self.data
+
+  def plot_data(self,data):
+    self.data=data
+    print('I am visualizing the contribution of Top 5 Channels to the Daily Revenue\n...')
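+    # Collapse every channel outside the top five into 'The Rest' before plotting the stacked areas.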
+    self.data['The Total'] = self.data.sum(axis=1)
+    self.data['The Rest']= self.data['The Total']-self.data[['A','G','H','I','B']].sum(axis=1)
+    self.xlim=('2017-03-01','2018-03-24')
+    self.ax =self.data[['A','G','H','I','B','The Rest']].plot.area(xlim=self.xlim, figsize=(12,8))
+    self.ax.set_xlabel('Date')
+    self.ax.set_ylabel('Revenue')
+    print(self.data['The Rest'].describe())
+    plt.savefig('channel_ren.png')
+    files.download('channel_ren.png') 
+    return self.data
+    
+  def export_data(self,data,title):
+    print('I am exporting data to Excel and CSV files\n...')
+    data.to_excel(title+'.xlsx')
+    self.excel=files.download(title+'.xlsx')
+    data.to_csv(title+'.csv')
+    self.csv=files.download(title+'.csv')
+    return self.excel
+
+# The EDA_Channel_User class analyzes the impact of the online marketing channels on
+# the daily number of users
+class EDA_Channel_User():
+
+  def __init__(self,transform):
+    print('\033[1m'+'I am analyzing the influences of the online marketing channels on the daily number of users'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab[['Conv_Date','Channel','User_ID']]
+    self.by_DateUser=self.group_data(self.data,'Conv_Date','Channel','User_ID')
+    self.unstaked_data=self.unstack_data(self.by_DateUser,'User_ID','bar')
+    #display(self.unstaked_data.head(3))
+    self.plotted_data=self.plot_data(self.unstaked_data)   
+    # self.exported_data=self.export_data(self.unstaked_data,'channel_num_user')
+
+  def group_data(self,data,target_1,target_2,exp):
+    print('I am grouping data by '+ target_1 +' and '+ target_2 + '\n...')
+    self.data=data.groupby([target_1,target_2])#.agg({exp:'count'})
+    print('I am done! ')
+    return self.data    
+
+  def unstack_data(self,data,exp,kind):
+    print('I am unstacking data \n...')
+    data=data.count()[exp].unstack(level=-1)
+    self.data=data
+    print('Data were unstacked completely\n...')
+    return self.data
+
+  def plot_data(self,data):
+    self.data=data
+    print('I am visualizing the contribution of Top 5 Channels to the Daily Number of Users\n...')
+    self.data['The Total'] = self.data.sum(axis=1)
+    self.data['The Rest'] = self.data['The Total'] - self.data[['A','G','H','I','B']].sum(axis=1)
+    self.xlim=('2017-03-01','2018-03-24')
+    self.ax =self.data[['A','G','H','I','B','The Rest']].plot.area(xlim=self.xlim, figsize=(12,8))
+    self.ax.set_xlabel('Date')
+    self.ax.set_ylabel('The number of Users')
+    plt.savefig('channel_user.png')
+    files.download('channel_user.png') 
+    return self.data
+    
+  def export_data(self,data,title):
+    print('I am exporting data to Excel and CSV files\n...')
+    data.to_excel(title+'.xlsx')
+    self.excel=files.download(title+'.xlsx')
+    data.to_csv(title+'.csv')
+    self.csv=files.download(title+'.csv')
+    return self.excel
+
+# The EDA_channel_IHC class plots the changes in the daily IHC of the channels
+class EDA_channel_IHC():
+
+  def __init__(self,transform):
+    print('\033[1m'+'I am doing the EDA on the daily IHC of the Channels'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab[['Conv_Date','Channel','IHC_Conv']] #'Conv_Date',
+    self.by_TimeChannel=self.group_data(self.data,'Conv_Date','Channel','IHC_Conv')
+    self.unstacked_data=self.unstack_data(self.by_TimeChannel,'IHC_Conv')
+    self.change_plot=self.plot_data(self.unstacked_data)
+
+  def plot_data(self,data):
+    self.data=data
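+    # Plot the daily IHC_Conv of the five top channels as line series over the observed date range.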
+    # self.data['The Rest'] = self.data.sum(axis=1)
+    self.xlim=('2017-03-01','2018-03-24')
+    self.ylim=(0,550)
+    self.ax =self.data[['A','G','H','I','B']].plot.line(xlim=self.xlim,ylim=self.ylim,figsize=(12,8))
+    self.ax.set_xlabel('Date')
+    self.ax.set_ylabel('IHC_Conv')
+    plt.savefig('channel_IHC.png')
+    files.download('channel_IHC.png') 
+    return self.data
+
+  def group_data(self,data,target_1,target_2,exp):
+    print('I am grouping data by '+ target_1 +' and '+ target_2 + '\n...')
+    self.data=data.groupby([target_1,target_2])#.agg({exp:'sum'})
+    print('I am done! ')
+    return self.data    
+
+  def unstack_data(self,data,exp):
+    print('I am unstacking data \n...')
+    data=data.sum()[exp].unstack(level=-1)
+    self.data=data
+    print('Data were unstacked completely\n...')
+    return self.data
+
+
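+# Run the full ETL pipeline (Extract, Transform, Load) and then generate every EDA report.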
+def main():
+  extract=Extract(A_url,B_url)
+  transform=Transform(extract)
+  load=Load(transform)
+  EDA_Overview_KPI(transform)
+  EDA_Static_Ren(transform)
+  EDA_KPI_Return(transform)
+  EDA_Static_User(transform)
+  EDA_Static_Conversion(transform)
+  EDA_Channel_Revenue(transform)
+  EDA_Channel_User(transform)
+  EDA_channel_IHC(transform)
+ 
+
+if __name__=='__main__':
+  main()

+ 727 - 0
BI/cube-backup.py

@@ -0,0 +1,727 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=line-too-long,unused-argument,ungrouped-imports
+"""A collection of ORM sqlalchemy models for Superset"""
+import json
+import logging
+import textwrap
+from contextlib import closing
+from copy import deepcopy
+from datetime import datetime
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
+
+import numpy
+import pandas as pd
+import sqlalchemy as sqla
+import sqlparse
+from flask import g, request
+from flask_appbuilder import Model
+from sqlalchemy import (
+    Boolean,
+    Column,
+    create_engine,
+    DateTime,
+    ForeignKey,
+    Integer,
+    MetaData,
+    String,
+    Table,
+    Text,
+)
+from sqlalchemy.engine import Dialect, Engine, url
+from sqlalchemy.engine.reflection import Inspector
+from sqlalchemy.engine.url import make_url, URL
+from sqlalchemy.ext.hybrid import hybrid_property
+from sqlalchemy.orm import relationship
+from sqlalchemy.pool import NullPool
+from sqlalchemy.schema import UniqueConstraint
+from sqlalchemy.sql import expression, Select
+from sqlalchemy_utils import EncryptedType
+
+from superset import app, db_engine_specs, is_feature_enabled, security_manager
+from superset.db_engine_specs.base import TimeGrain
+from superset.models.dashboard import Dashboard
+from superset.models.helpers import AuditMixinNullable, ImportMixin
+from superset.models.tags import DashboardUpdater, FavStarUpdater
+from superset.utils import cache as cache_util, core as utils
+
+config = app.config
+custom_password_store = config["SQLALCHEMY_CUSTOM_PASSWORD_STORE"]
+stats_logger = config["STATS_LOGGER"]
+log_query = config["QUERY_LOGGER"]
+metadata = Model.metadata  # pylint: disable=no-member
+logger = logging.getLogger(__name__)
+
+PASSWORD_MASK = "X" * 10
+DB_CONNECTION_MUTATOR = config["DB_CONNECTION_MUTATOR"]
+
+
+class Url(Model, AuditMixinNullable):
+    """Used for the short url feature"""
+
+    __tablename__ = "url"
+    id = Column(Integer, primary_key=True)
+    url = Column(Text)
+
+
+class KeyValue(Model):  # pylint: disable=too-few-public-methods
+
+    """Used for any type of key-value store"""
+
+    __tablename__ = "keyvalue"
+    id = Column(Integer, primary_key=True)
+    value = Column(Text, nullable=False)
+
+
+class CssTemplate(Model, AuditMixinNullable):
+
+    """CSS templates for dashboards"""
+
+    __tablename__ = "css_templates"
+    id = Column(Integer, primary_key=True)
+    template_name = Column(String(250))
+    css = Column(Text, default="")
+
+
+class Database(
+    Model, AuditMixinNullable, ImportMixin
+):  # pylint: disable=too-many-public-methods
+
+    """An ORM object that stores Database related information"""
+
+    __tablename__ = "dbs"
+    type = "table"
+    __table_args__ = (UniqueConstraint("database_name"),)
+
+    id = Column(Integer, primary_key=True)
+    verbose_name = Column(String(250), unique=True)
+    # short unique name, used in permissions
+    database_name = Column(String(250), unique=True, nullable=False)
+    sqlalchemy_uri = Column(String(1024), nullable=False)
+    password = Column(EncryptedType(String(1024), config["SECRET_KEY"]))
+    cache_timeout = Column(Integer)
+    select_as_create_table_as = Column(Boolean, default=False)
+    expose_in_sqllab = Column(Boolean, default=True)
+    allow_run_async = Column(Boolean, default=False)
+    allow_csv_upload = Column(Boolean, default=False)
+    allow_ctas = Column(Boolean, default=False)
+    allow_cvas = Column(Boolean, default=False)
+    allow_dml = Column(Boolean, default=False)
+    force_ctas_schema = Column(String(250))
+    allow_multi_schema_metadata_fetch = Column(  # pylint: disable=invalid-name
+        Boolean, default=False
+    )
+    extra = Column(
+        Text,
+        default=textwrap.dedent(
+            """\
+    {
+        "metadata_params": {},
+        "engine_params": {},
+        "metadata_cache_timeout": {},
+        "schemas_allowed_for_csv_upload": []
+    }
+    """
+        ),
+    )
+    encrypted_extra = Column(EncryptedType(Text, config["SECRET_KEY"]), nullable=True)
+    impersonate_user = Column(Boolean, default=False)
+    server_cert = Column(EncryptedType(Text, config["SECRET_KEY"]), nullable=True)
+    export_fields = [
+        "database_name",
+        "sqlalchemy_uri",
+        "cache_timeout",
+        "expose_in_sqllab",
+        "allow_run_async",
+        "allow_ctas",
+        "allow_cvas",
+        "allow_csv_upload",
+        "extra",
+    ]
+    export_children = ["tables"]
+
+    def __repr__(self) -> str:
+        return self.name
+
+    @property
+    def name(self) -> str:
+        return self.verbose_name if self.verbose_name else self.database_name
+
+    @property
+    def allows_subquery(self) -> bool:
+        return self.db_engine_spec.allows_subqueries
+
+    @property
+    def function_names(self) -> List[str]:
+        try:
+            return self.db_engine_spec.get_function_names(self)
+        except Exception as ex:  # pylint: disable=broad-except
+            # function_names property is used in bulk APIs and should not hard crash
+            # more info in: https://github.com/apache/incubator-superset/issues/9678
+            logger.error(
+                "Failed to fetch database function names with error: %s", str(ex)
+            )
+        return []
+
+    @property
+    def allows_cost_estimate(self) -> bool:
+        extra = self.get_extra()
+
+        database_version = extra.get("version")
+        cost_estimate_enabled: bool = extra.get("cost_estimate_enabled")  # type: ignore
+
+        return (
+            self.db_engine_spec.get_allow_cost_estimate(database_version)
+            and cost_estimate_enabled
+        )
+
+    @property
+    def allows_virtual_table_explore(self) -> bool:
+        extra = self.get_extra()
+
+        return bool(extra.get("allows_virtual_table_explore", True))
+
+    @property
+    def explore_database_id(self) -> int:
+        return self.get_extra().get("explore_database_id", self.id)
+
+    @property
+    def data(self) -> Dict[str, Any]:
+        return {
+            "id": self.id,
+            "name": self.database_name,
+            "backend": self.backend,
+            "allow_multi_schema_metadata_fetch": self.allow_multi_schema_metadata_fetch,
+            "allows_subquery": self.allows_subquery,
+            "allows_cost_estimate": self.allows_cost_estimate,
+            "allows_virtual_table_explore": self.allows_virtual_table_explore,
+            "explore_database_id": self.explore_database_id,
+        }
+
+    @property
+    def unique_name(self) -> str:
+        return self.database_name
+
+    @property
+    def url_object(self) -> URL:
+        return make_url(self.sqlalchemy_uri_decrypted)
+
+    @property
+    def backend(self) -> str:
+        sqlalchemy_url = make_url(self.sqlalchemy_uri_decrypted)
+        return sqlalchemy_url.get_backend_name()  # pylint: disable=no-member
+
+    @property
+    def metadata_cache_timeout(self) -> Dict[str, Any]:
+        return self.get_extra().get("metadata_cache_timeout", {})
+
+    @property
+    def schema_cache_enabled(self) -> bool:
+        return "schema_cache_timeout" in self.metadata_cache_timeout
+
+    @property
+    def schema_cache_timeout(self) -> Optional[int]:
+        return self.metadata_cache_timeout.get("schema_cache_timeout")
+
+    @property
+    def table_cache_enabled(self) -> bool:
+        return "table_cache_timeout" in self.metadata_cache_timeout
+
+    @property
+    def table_cache_timeout(self) -> Optional[int]:
+        return self.metadata_cache_timeout.get("table_cache_timeout")
+
+    @property
+    def default_schemas(self) -> List[str]:
+        return self.get_extra().get("default_schemas", [])
+
+    @property
+    def connect_args(self) -> Dict[str, Any]:
+        return self.get_extra().get("engine_params", {}).get("connect_args", {})
+
+    @classmethod
+    def get_password_masked_url_from_uri(  # pylint: disable=invalid-name
+        cls, uri: str
+    ) -> URL:
+        sqlalchemy_url = make_url(uri)
+        return cls.get_password_masked_url(sqlalchemy_url)
+
+    @classmethod
+    def get_password_masked_url(
+        cls, url: URL  # pylint: disable=redefined-outer-name
+    ) -> URL:
+        url_copy = deepcopy(url)
+        if url_copy.password is not None:
+            url_copy.password = PASSWORD_MASK
+        return url_copy
+
+    def set_sqlalchemy_uri(self, uri: str) -> None:
+        conn = sqla.engine.url.make_url(uri.strip())
+        if conn.password != PASSWORD_MASK and not custom_password_store:
+            # do not over-write the password with the password mask
+            self.password = conn.password
+        conn.password = PASSWORD_MASK if conn.password else None
+        self.sqlalchemy_uri = str(conn)  # hides the password
+
+    def get_effective_user(
+        self,
+        url: URL,  # pylint: disable=redefined-outer-name
+        user_name: Optional[str] = None,
+    ) -> Optional[str]:
+        """
+        Get the effective user, especially during impersonation.
+        :param url: SQL Alchemy URL object
+        :param user_name: Default username
+        :return: The effective username
+        """
+        effective_username = None
+        if self.impersonate_user:
+            effective_username = url.username
+            if user_name:
+                effective_username = user_name
+            elif (
+                hasattr(g, "user")
+                and hasattr(g.user, "username")
+                and g.user.username is not None
+            ):
+                effective_username = g.user.username
+        return effective_username
+
+    @utils.memoized(watch=("impersonate_user", "sqlalchemy_uri_decrypted", "extra"))
+    def get_sqla_engine(
+        self,
+        schema: Optional[str] = None,
+        nullpool: bool = True,
+        user_name: Optional[str] = None,
+        source: Optional[utils.QuerySource] = None,
+    ) -> Engine:
+        extra = self.get_extra()
+        sqlalchemy_url = make_url(self.sqlalchemy_uri_decrypted)
+        self.db_engine_spec.adjust_database_uri(sqlalchemy_url, schema)
+        effective_username = self.get_effective_user(sqlalchemy_url, user_name)
+        # If using MySQL or Presto for example, will set url.username
+        # If using Hive, will not do anything yet since that relies on a
+        # configuration parameter instead.
+        self.db_engine_spec.modify_url_for_impersonation(
+            sqlalchemy_url, self.impersonate_user, effective_username
+        )
+
+        masked_url = self.get_password_masked_url(sqlalchemy_url)
+        logger.debug("Database.get_sqla_engine(). Masked URL: %s", str(masked_url))
+
+        params = extra.get("engine_params", {})
+        if nullpool:
+            params["poolclass"] = NullPool
+
+        connect_args = params.get("connect_args", {})
+        configuration = connect_args.get("configuration", {})
+
+        # If using Hive, this will set hive.server2.proxy.user=$effective_username
+        configuration.update(
+            self.db_engine_spec.get_configuration_for_impersonation(
+                str(sqlalchemy_url), self.impersonate_user, effective_username
+            )
+        )
+        if configuration:
+            connect_args["configuration"] = configuration
+        if connect_args:
+            params["connect_args"] = connect_args
+
+        params.update(self.get_encrypted_extra())
+
+        if DB_CONNECTION_MUTATOR:
+            if not source and request and request.referrer:
+                if "/superset/dashboard/" in request.referrer:
+                    source = utils.QuerySource.DASHBOARD
+                elif "/superset/explore/" in request.referrer:
+                    source = utils.QuerySource.CHART
+                elif "/superset/sqllab/" in request.referrer:
+                    source = utils.QuerySource.SQL_LAB
+
+            sqlalchemy_url, params = DB_CONNECTION_MUTATOR(
+                sqlalchemy_url, params, effective_username, security_manager, source
+            )
+
+        return create_engine(sqlalchemy_url, **params)
+
+    def get_reserved_words(self) -> Set[str]:
+        return self.get_dialect().preparer.reserved_words
+
+    def get_quoter(self) -> Callable[[str, Any], str]:
+        return self.get_dialect().identifier_preparer.quote
+
+    def get_df(  # pylint: disable=too-many-locals
+        self,
+        sql: str,
+        schema: Optional[str] = None,
+        mutator: Optional[Callable[[pd.DataFrame], None]] = None,
+    ) -> pd.DataFrame:
+        sqls = [str(s).strip(" ;") for s in sqlparse.parse(sql)]
+
+        engine = self.get_sqla_engine(schema=schema)
+        username = utils.get_username()
+
+        def needs_conversion(df_series: pd.Series) -> bool:
+            return not df_series.empty and isinstance(df_series[0], (list, dict))
+
+        def _log_query(sql: str) -> None:
+            if log_query:
+                log_query(engine.url, sql, schema, username, __name__, security_manager)
+
+        with closing(engine.raw_connection()) as conn:
+            with closing(conn.cursor()) as cursor:
+                for sql_ in sqls[:-1]:
+                    _log_query(sql_)
+                    self.db_engine_spec.execute(cursor, sql_)
+                    cursor.fetchall()
+
+                _log_query(sqls[-1])
+                self.db_engine_spec.execute(cursor, sqls[-1])
+
+                if cursor.description is not None:
+                    columns = [col_desc[0] for col_desc in cursor.description]
+                else:
+                    columns = []
+
+                df = pd.DataFrame.from_records(
+                    data=list(cursor.fetchall()), columns=columns, coerce_float=True
+                )
+
+                if mutator:
+                    mutator(df)
+
+                for k, v in df.dtypes.items():
+                    if v.type == numpy.object_ and needs_conversion(df[k]):
+                        df[k] = df[k].apply(utils.json_dumps_w_dates)
+                return df
+
+    def compile_sqla_query(self, qry: Select, schema: Optional[str] = None) -> str:
+        engine = self.get_sqla_engine(schema=schema)
+
+        sql = str(qry.compile(engine, compile_kwargs={"literal_binds": True}))
+
+        if (
+            engine.dialect.identifier_preparer._double_percents  # pylint: disable=protected-access
+        ):
+            sql = sql.replace("%%", "%")
+
+        return sql
+
+    def select_star(  # pylint: disable=too-many-arguments
+        self,
+        table_name: str,
+        schema: Optional[str] = None,
+        limit: int = 100,
+        show_cols: bool = False,
+        indent: bool = True,
+        latest_partition: bool = False,
+        cols: Optional[List[Dict[str, Any]]] = None,
+    ) -> str:
+        """Generates a ``select *`` statement in the proper dialect"""
+        eng = self.get_sqla_engine(schema=schema, source=utils.QuerySource.SQL_LAB)
+        return self.db_engine_spec.select_star(
+            self,
+            table_name,
+            schema=schema,
+            engine=eng,
+            limit=limit,
+            show_cols=show_cols,
+            indent=indent,
+            latest_partition=latest_partition,
+            cols=cols,
+        )
+
+    def apply_limit_to_sql(self, sql: str, limit: int = 1000) -> str:
+        return self.db_engine_spec.apply_limit_to_sql(sql, limit, self)
+
+    def safe_sqlalchemy_uri(self) -> str:
+        return self.sqlalchemy_uri
+
+    @property
+    def inspector(self) -> Inspector:
+        engine = self.get_sqla_engine()
+        return sqla.inspect(engine)
+
+    @cache_util.memoized_func(
+        key= "db:{}:schema:None:table_list",
+        attribute_in_key="id",
+    )
+    def get_all_table_names_in_database(
+        self,
+        cache: bool = False,
+        cache_timeout: Optional[bool] = None,
+        force: bool = False,
+    ) -> List[utils.DatasourceName]:
+        """Parameters need to be passed as keyword arguments."""
+        if not self.allow_multi_schema_metadata_fetch:
+            return []
+        return self.db_engine_spec.get_all_datasource_names(self, "table")
+
+    @cache_util.memoized_func(
+        key= "db:{}:schema:None:view_list", attribute_in_key="id"
+    )
+    def get_all_view_names_in_database(
+        self,
+        cache: bool = False,
+        cache_timeout: Optional[bool] = None,
+        force: bool = False,
+    ) -> List[utils.DatasourceName]:
+        """Parameters need to be passed as keyword arguments."""
+        if not self.allow_multi_schema_metadata_fetch:
+            return []
+        return self.db_engine_spec.get_all_datasource_names(self, "view")
+
+    @cache_util.memoized_func(
+        key= f"db:{{}}:schema:{kwargs.get('schema')}:table_list",  # type: ignore
+        attribute_in_key="id",
+    )
+    def get_all_table_names_in_schema(
+        self,
+        schema: str,
+        cache: bool = False,
+        cache_timeout: Optional[int] = None,
+        force: bool = False,
+    ) -> List[utils.DatasourceName]:
+        """Parameters need to be passed as keyword arguments.
+
+        For unused parameters, they are referenced in
+        cache_util.memoized_func decorator.
+
+        :param schema: schema name
+        :param cache: whether cache is enabled for the function
+        :param cache_timeout: timeout in seconds for the cache
+        :param force: whether to force refresh the cache
+        :return: list of tables
+        """
+        try:
+            tables = self.db_engine_spec.get_table_names(
+                database=self, inspector=self.inspector, schema=schema
+            )
+            return [
+                utils.DatasourceName(table=table, schema=schema) for table in tables
+            ]
+        except Exception as ex:  # pylint: disable=broad-except
+            logger.exception(ex)
+
+    @cache_util.memoized_func(
+        key= f"db:{{}}:schema:{kwargs.get('schema')}:view_list",  # type: ignore
+        attribute_in_key="id",
+    )
+    def get_all_view_names_in_schema(
+        self,
+        schema: str,
+        cache: bool = False,
+        cache_timeout: Optional[int] = None,
+        force: bool = False,
+    ) -> List[utils.DatasourceName]:
+        """Parameters need to be passed as keyword arguments.
+
+        For unused parameters, they are referenced in
+        cache_util.memoized_func decorator.
+
+        :param schema: schema name
+        :param cache: whether cache is enabled for the function
+        :param cache_timeout: timeout in seconds for the cache
+        :param force: whether to force refresh the cache
+        :return: list of views
+        """
+        try:
+            views = self.db_engine_spec.get_view_names(
+                database=self, inspector=self.inspector, schema=schema
+            )
+            return [utils.DatasourceName(table=view, schema=schema) for view in views]
+        except Exception as ex:  # pylint: disable=broad-except
+            logger.exception(ex)
+
+    @cache_util.memoized_func(
+        key= "db:{}:schema_list", attribute_in_key="id"
+    )
+    def get_all_schema_names(
+        self,
+        cache: bool = False,
+        cache_timeout: Optional[int] = None,
+        force: bool = False,
+    ) -> List[str]:
+        """Parameters need to be passed as keyword arguments.
+
+        For unused parameters, they are referenced in
+        cache_util.memoized_func decorator.
+
+        :param cache: whether cache is enabled for the function
+        :param cache_timeout: timeout in seconds for the cache
+        :param force: whether to force refresh the cache
+        :return: schema list
+        """
+        return self.db_engine_spec.get_schema_names(self.inspector)
+
+    @property
+    def db_engine_spec(self) -> Type[db_engine_specs.BaseEngineSpec]:
+        return db_engine_specs.engines.get(self.backend, db_engine_specs.BaseEngineSpec)
+
+    @classmethod
+    def get_db_engine_spec_for_backend(
+        cls, backend: str
+    ) -> Type[db_engine_specs.BaseEngineSpec]:
+        return db_engine_specs.engines.get(backend, db_engine_specs.BaseEngineSpec)
+
+    def grains(self) -> Tuple[TimeGrain, ...]:
+        """Defines time granularity database-specific expressions.
+
+        The idea here is to make it easy for users to change the time grain
+        from a datetime (maybe the source grain is arbitrary timestamps, daily
+        or 5 minutes increments) to another, "truncated" datetime. Since
+        each database has slightly different but similar datetime functions,
+        this allows a mapping between database engines and actual functions.
+        """
+        return self.db_engine_spec.get_time_grains()
+
+    def get_extra(self) -> Dict[str, Any]:
+        return self.db_engine_spec.get_extra_params(self)
+
+    def get_encrypted_extra(self) -> Dict[str, Any]:
+        encrypted_extra = {}
+        if self.encrypted_extra:
+            try:
+                encrypted_extra = json.loads(self.encrypted_extra)
+            except json.JSONDecodeError as ex:
+                logger.error(ex)
+                raise ex
+        return encrypted_extra
+
+    def get_table(self, table_name: str, schema: Optional[str] = None) -> Table:
+        extra = self.get_extra()
+        meta = MetaData(**extra.get("metadata_params", {}))
+        return Table(
+            table_name,
+            meta,
+            schema=schema or None,
+            autoload=True,
+            autoload_with=self.get_sqla_engine(),
+        )
+
+    def get_columns(
+        self, table_name: str, schema: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        return self.db_engine_spec.get_columns(self.inspector, table_name, schema)
+
+    def get_indexes(
+        self, table_name: str, schema: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        return self.inspector.get_indexes(table_name, schema)
+
+    def get_pk_constraint(
+        self, table_name: str, schema: Optional[str] = None
+    ) -> Dict[str, Any]:
+        return self.inspector.get_pk_constraint(table_name, schema)
+
+    def get_foreign_keys(
+        self, table_name: str, schema: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        return self.inspector.get_foreign_keys(table_name, schema)
+
+    def get_schema_access_for_csv_upload(  # pylint: disable=invalid-name
+        self,
+    ) -> List[str]:
+        allowed_databases = self.get_extra().get("schemas_allowed_for_csv_upload", [])
+        if hasattr(g, "user"):
+            extra_allowed_databases = config["ALLOWED_USER_CSV_SCHEMA_FUNC"](
+                self, g.user
+            )
+            allowed_databases += extra_allowed_databases
+        return sorted(set(allowed_databases))
+
+    @property
+    def sqlalchemy_uri_decrypted(self) -> str:
+        conn = sqla.engine.url.make_url(self.sqlalchemy_uri)
+        if custom_password_store:
+            conn.password = custom_password_store(conn)
+        else:
+            conn.password = self.password
+        return str(conn)
+
+    @property
+    def sql_url(self) -> str:
+        return f"/superset/sql/{self.id}/"
+
+    @hybrid_property
+    def perm(self) -> str:
+        return f"[{self.database_name}].(id:{self.id})"
+
+    @perm.expression  # type: ignore
+    def perm(cls) -> str:  # pylint: disable=no-self-argument
+        return (
+            "[" + cls.database_name + "].(id:" + expression.cast(cls.id, String) + ")"
+        )
+
+    def get_perm(self) -> str:
+        return self.perm  # type: ignore
+
+    def has_table(self, table: Table) -> bool:
+        engine = self.get_sqla_engine()
+        return engine.has_table(table.table_name, table.schema or None)
+
+    def has_table_by_name(self, table_name: str, schema: Optional[str] = None) -> bool:
+        engine = self.get_sqla_engine()
+        return engine.has_table(table_name, schema)
+
+    @utils.memoized
+    def get_dialect(self) -> Dialect:
+        sqla_url = url.make_url(self.sqlalchemy_uri_decrypted)
+        return sqla_url.get_dialect()()  # pylint: disable=no-member
+
+
+sqla.event.listen(Database, "after_insert", security_manager.set_perm)
+sqla.event.listen(Database, "after_update", security_manager.set_perm)
+
+
+class Log(Model):  # pylint: disable=too-few-public-methods
+
+    """ORM object used to log Superset actions to the database"""
+
+    __tablename__ = "logs"
+
+    id = Column(Integer, primary_key=True)
+    action = Column(String(512))
+    user_id = Column(Integer, ForeignKey("ab_user.id"))
+    dashboard_id = Column(Integer)
+    slice_id = Column(Integer)
+    json = Column(Text)
+    user = relationship(
+        security_manager.user_model, backref="logs", foreign_keys=[user_id]
+    )
+    dttm = Column(DateTime, default=datetime.utcnow)
+    duration_ms = Column(Integer)
+    referrer = Column(String(1024))
+
+
+class FavStar(Model):  # pylint: disable=too-few-public-methods
+    __tablename__ = "favstar"
+
+    id = Column(Integer, primary_key=True)
+    user_id = Column(Integer, ForeignKey("ab_user.id"))
+    class_name = Column(String(50))
+    obj_id = Column(Integer)
+    dttm = Column(DateTime, default=datetime.utcnow)
+
+
+# events for updating tags
+if is_feature_enabled("TAGGING_SYSTEM"):
+    sqla.event.listen(Dashboard, "after_insert", DashboardUpdater.after_insert)
+    sqla.event.listen(Dashboard, "after_update", DashboardUpdater.after_update)
+    sqla.event.listen(Dashboard, "after_delete", DashboardUpdater.after_delete)
+    sqla.event.listen(FavStar, "after_insert", FavStarUpdater.after_insert)
+    sqla.event.listen(FavStar, "after_delete", FavStarUpdater.after_delete)

+ 727 - 0
BI/cube.py

@@ -0,0 +1,727 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=line-too-long,unused-argument,ungrouped-imports
+"""A collection of ORM sqlalchemy models for Superset"""
+import json
+import logging
+import textwrap
+from contextlib import closing
+from copy import deepcopy
+from datetime import datetime
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
+
+import numpy
+import pandas as pd
+import sqlalchemy as sqla
+import sqlparse
+from flask import g, request
+from flask_appbuilder import Model
+from sqlalchemy import (
+    Boolean,
+    Column,
+    create_engine,
+    DateTime,
+    ForeignKey,
+    Integer,
+    MetaData,
+    String,
+    Table,
+    Text,
+)
+from sqlalchemy.engine import Dialect, Engine, url
+from sqlalchemy.engine.reflection import Inspector
+from sqlalchemy.engine.url import make_url, URL
+from sqlalchemy.ext.hybrid import hybrid_property
+from sqlalchemy.orm import relationship
+from sqlalchemy.pool import NullPool
+from sqlalchemy.schema import UniqueConstraint
+from sqlalchemy.sql import expression, Select
+from sqlalchemy_utils import EncryptedType
+
+from superset import app, db_engine_specs, is_feature_enabled, security_manager
+from superset.db_engine_specs.base import TimeGrain
+from superset.models.dashboard import Dashboard
+from superset.models.helpers import AuditMixinNullable, ImportMixin
+from superset.models.tags import DashboardUpdater, FavStarUpdater
+from superset.utils import cache as cache_util, core as utils
+
+config = app.config
+custom_password_store = config["SQLALCHEMY_CUSTOM_PASSWORD_STORE"]
+stats_logger = config["STATS_LOGGER"]
+log_query = config["QUERY_LOGGER"]
+metadata = Model.metadata  # pylint: disable=no-member
+logger = logging.getLogger(__name__)
+
+PASSWORD_MASK = "X" * 10
+DB_CONNECTION_MUTATOR = config["DB_CONNECTION_MUTATOR"]
+
+
+class Url(Model, AuditMixinNullable):
+    """Used for the short url feature"""
+
+    __tablename__ = "url"
+    id = Column(Integer, primary_key=True)
+    url = Column(Text)
+
+
+class KeyValue(Model):  # pylint: disable=too-few-public-methods
+
+    """Used for any type of key-value store"""
+
+    __tablename__ = "keyvalue"
+    id = Column(Integer, primary_key=True)
+    value = Column(Text, nullable=False)
+
+
+class CssTemplate(Model, AuditMixinNullable):
+
+    """CSS templates for dashboards"""
+
+    __tablename__ = "css_templates"
+    id = Column(Integer, primary_key=True)
+    template_name = Column(String(250))
+    css = Column(Text, default="")
+
+
+class Database(
+    Model, AuditMixinNullable, ImportMixin
+):  # pylint: disable=too-many-public-methods
+
+    """An ORM object that stores Database related information"""
+
+    __tablename__ = "dbs"
+    type = "table"
+    __table_args__ = (UniqueConstraint("database_name"),)
+
+    id = Column(Integer, primary_key=True)
+    verbose_name = Column(String(250), unique=True)
+    # short unique name, used in permissions
+    database_name = Column(String(250), unique=True, nullable=False)
+    sqlalchemy_uri = Column(String(1024), nullable=False)
+    password = Column(EncryptedType(String(1024), config["SECRET_KEY"]))
+    cache_timeout = Column(Integer)
+    select_as_create_table_as = Column(Boolean, default=False)
+    expose_in_sqllab = Column(Boolean, default=True)
+    allow_run_async = Column(Boolean, default=False)
+    allow_csv_upload = Column(Boolean, default=False)
+    allow_ctas = Column(Boolean, default=False)
+    allow_cvas = Column(Boolean, default=False)
+    allow_dml = Column(Boolean, default=False)
+    force_ctas_schema = Column(String(250))
+    allow_multi_schema_metadata_fetch = Column(  # pylint: disable=invalid-name
+        Boolean, default=False
+    )
+    extra = Column(
+        Text,
+        default=textwrap.dedent(
+            """\
+    {
+        "metadata_params": {},
+        "engine_params": {},
+        "metadata_cache_timeout": {},
+        "schemas_allowed_for_csv_upload": []
+    }
+    """
+        ),
+    )
+    encrypted_extra = Column(EncryptedType(Text, config["SECRET_KEY"]), nullable=True)
+    impersonate_user = Column(Boolean, default=False)
+    server_cert = Column(EncryptedType(Text, config["SECRET_KEY"]), nullable=True)
+    export_fields = [
+        "database_name",
+        "sqlalchemy_uri",
+        "cache_timeout",
+        "expose_in_sqllab",
+        "allow_run_async",
+        "allow_ctas",
+        "allow_cvas",
+        "allow_csv_upload",
+        "extra",
+    ]
+    export_children = ["tables"]
+
+    def __repr__(self) -> str:
+        return self.name
+
+    @property
+    def name(self) -> str:
+        return self.verbose_name if self.verbose_name else self.database_name
+
+    @property
+    def allows_subquery(self) -> bool:
+        return self.db_engine_spec.allows_subqueries
+
+    @property
+    def function_names(self) -> List[str]:
+        try:
+            return self.db_engine_spec.get_function_names(self)
+        except Exception as ex:  # pylint: disable=broad-except
+            # function_names property is used in bulk APIs and should not hard crash
+            # more info in: https://github.com/apache/incubator-superset/issues/9678
+            logger.error(
+                "Failed to fetch database function names with error: %s", str(ex)
+            )
+        return []
+
+    @property
+    def allows_cost_estimate(self) -> bool:
+        extra = self.get_extra()
+
+        database_version = extra.get("version")
+        cost_estimate_enabled: bool = extra.get("cost_estimate_enabled")  # type: ignore
+
+        return (
+            self.db_engine_spec.get_allow_cost_estimate(database_version)
+            and cost_estimate_enabled
+        )
+
+    @property
+    def allows_virtual_table_explore(self) -> bool:
+        extra = self.get_extra()
+
+        return bool(extra.get("allows_virtual_table_explore", True))
+
+    @property
+    def explore_database_id(self) -> int:
+        return self.get_extra().get("explore_database_id", self.id)
+
+    @property
+    def data(self) -> Dict[str, Any]:
+        return {
+            "id": self.id,
+            "name": self.database_name,
+            "backend": self.backend,
+            "allow_multi_schema_metadata_fetch": self.allow_multi_schema_metadata_fetch,
+            "allows_subquery": self.allows_subquery,
+            "allows_cost_estimate": self.allows_cost_estimate,
+            "allows_virtual_table_explore": self.allows_virtual_table_explore,
+            "explore_database_id": self.explore_database_id,
+        }
+
+    @property
+    def unique_name(self) -> str:
+        return self.database_name
+
+    @property
+    def url_object(self) -> URL:
+        return make_url(self.sqlalchemy_uri_decrypted)
+
+    @property
+    def backend(self) -> str:
+        sqlalchemy_url = make_url(self.sqlalchemy_uri_decrypted)
+        return sqlalchemy_url.get_backend_name()  # pylint: disable=no-member
+
+    @property
+    def metadata_cache_timeout(self) -> Dict[str, Any]:
+        return self.get_extra().get("metadata_cache_timeout", {})
+
+    @property
+    def schema_cache_enabled(self) -> bool:
+        return "schema_cache_timeout" in self.metadata_cache_timeout
+
+    @property
+    def schema_cache_timeout(self) -> Optional[int]:
+        return self.metadata_cache_timeout.get("schema_cache_timeout")
+
+    @property
+    def table_cache_enabled(self) -> bool:
+        return "table_cache_timeout" in self.metadata_cache_timeout
+
+    @property
+    def table_cache_timeout(self) -> Optional[int]:
+        return self.metadata_cache_timeout.get("table_cache_timeout")
+
+    @property
+    def default_schemas(self) -> List[str]:
+        return self.get_extra().get("default_schemas", [])
+
+    @property
+    def connect_args(self) -> Dict[str, Any]:
+        return self.get_extra().get("engine_params", {}).get("connect_args", {})
+
+    @classmethod
+    def get_password_masked_url_from_uri(  # pylint: disable=invalid-name
+        cls, uri: str
+    ) -> URL:
+        sqlalchemy_url = make_url(uri)
+        return cls.get_password_masked_url(sqlalchemy_url)
+
+    @classmethod
+    def get_password_masked_url(
+        cls, url: URL  # pylint: disable=redefined-outer-name
+    ) -> URL:
+        url_copy = deepcopy(url)
+        if url_copy.password is not None:
+            url_copy.password = PASSWORD_MASK
+        return url_copy
+
+    def set_sqlalchemy_uri(self, uri: str) -> None:
+        conn = sqla.engine.url.make_url(uri.strip())
+        if conn.password != PASSWORD_MASK and not custom_password_store:
+            # do not over-write the password with the password mask
+            self.password = conn.password
+        conn.password = PASSWORD_MASK if conn.password else None
+        self.sqlalchemy_uri = str(conn)  # hides the password
+
+    def get_effective_user(
+        self,
+        url: URL,  # pylint: disable=redefined-outer-name
+        user_name: Optional[str] = None,
+    ) -> Optional[str]:
+        """
+        Get the effective user, especially during impersonation.
+        :param url: SQL Alchemy URL object
+        :param user_name: Default username
+        :return: The effective username
+        """
+        effective_username = None
+        if self.impersonate_user:
+            effective_username = url.username
+            if user_name:
+                effective_username = user_name
+            elif (
+                hasattr(g, "user")
+                and hasattr(g.user, "username")
+                and g.user.username is not None
+            ):
+                effective_username = g.user.username
+        return effective_username
+
+    @utils.memoized(watch=("impersonate_user", "sqlalchemy_uri_decrypted", "extra"))
+    def get_sqla_engine(
+        self,
+        schema: Optional[str] = None,
+        nullpool: bool = True,
+        user_name: Optional[str] = None,
+        source: Optional[utils.QuerySource] = None,
+    ) -> Engine:
+        extra = self.get_extra()
+        sqlalchemy_url = make_url(self.sqlalchemy_uri_decrypted)
+        self.db_engine_spec.adjust_database_uri(sqlalchemy_url, schema)
+        effective_username = self.get_effective_user(sqlalchemy_url, user_name)
+        # If using MySQL or Presto for example, will set url.username
+        # If using Hive, will not do anything yet since that relies on a
+        # configuration parameter instead.
+        self.db_engine_spec.modify_url_for_impersonation(
+            sqlalchemy_url, self.impersonate_user, effective_username
+        )
+
+        masked_url = self.get_password_masked_url(sqlalchemy_url)
+        logger.debug("Database.get_sqla_engine(). Masked URL: %s", str(masked_url))
+
+        params = extra.get("engine_params", {})
+        if nullpool:
+            params["poolclass"] = NullPool
+
+        connect_args = params.get("connect_args", {})
+        configuration = connect_args.get("configuration", {})
+
+        # If using Hive, this will set hive.server2.proxy.user=$effective_username
+        configuration.update(
+            self.db_engine_spec.get_configuration_for_impersonation(
+                str(sqlalchemy_url), self.impersonate_user, effective_username
+            )
+        )
+        if configuration:
+            connect_args["configuration"] = configuration
+        if connect_args:
+            params["connect_args"] = connect_args
+
+        params.update(self.get_encrypted_extra())
+
+        if DB_CONNECTION_MUTATOR:
+            if not source and request and request.referrer:
+                if "/superset/dashboard/" in request.referrer:
+                    source = utils.QuerySource.DASHBOARD
+                elif "/superset/explore/" in request.referrer:
+                    source = utils.QuerySource.CHART
+                elif "/superset/sqllab/" in request.referrer:
+                    source = utils.QuerySource.SQL_LAB
+
+            sqlalchemy_url, params = DB_CONNECTION_MUTATOR(
+                sqlalchemy_url, params, effective_username, security_manager, source
+            )
+
+        return create_engine(sqlalchemy_url, **params)
+
+    def get_reserved_words(self) -> Set[str]:
+        return self.get_dialect().preparer.reserved_words
+
+    def get_quoter(self) -> Callable[[str, Any], str]:
+        return self.get_dialect().identifier_preparer.quote
+
+    def get_df(  # pylint: disable=too-many-locals
+        self,
+        sql: str,
+        schema: Optional[str] = None,
+        mutator: Optional[Callable[[pd.DataFrame], None]] = None,
+    ) -> pd.DataFrame:
+        sqls = [str(s).strip(" ;") for s in sqlparse.parse(sql)]
+
+        engine = self.get_sqla_engine(schema=schema)
+        username = utils.get_username()
+
+        def needs_conversion(df_series: pd.Series) -> bool:
+            return not df_series.empty and isinstance(df_series[0], (list, dict))
+
+        def _log_query(sql: str) -> None:
+            if log_query:
+                log_query(engine.url, sql, schema, username, __name__, security_manager)
+
+        with closing(engine.raw_connection()) as conn:
+            with closing(conn.cursor()) as cursor:
+                for sql_ in sqls[:-1]:
+                    _log_query(sql_)
+                    self.db_engine_spec.execute(cursor, sql_)
+                    cursor.fetchall()
+
+                _log_query(sqls[-1])
+                self.db_engine_spec.execute(cursor, sqls[-1])
+
+                if cursor.description is not None:
+                    columns = [col_desc[0] for col_desc in cursor.description]
+                else:
+                    columns = []
+
+                df = pd.DataFrame.from_records(
+                    data=list(cursor.fetchall()), columns=columns, coerce_float=True
+                )
+
+                if mutator:
+                    mutator(df)
+
+                for k, v in df.dtypes.items():
+                    if v.type == numpy.object_ and needs_conversion(df[k]):
+                        df[k] = df[k].apply(utils.json_dumps_w_dates)
+                return df
+
+    def compile_sqla_query(self, qry: Select, schema: Optional[str] = None) -> str:
+        engine = self.get_sqla_engine(schema=schema)
+
+        sql = str(qry.compile(engine, compile_kwargs={"literal_binds": True}))
+
+        if (
+            engine.dialect.identifier_preparer._double_percents  # pylint: disable=protected-access
+        ):
+            sql = sql.replace("%%", "%")
+
+        return sql
+
+    def select_star(  # pylint: disable=too-many-arguments
+        self,
+        table_name: str,
+        schema: Optional[str] = None,
+        limit: int = 100,
+        show_cols: bool = False,
+        indent: bool = True,
+        latest_partition: bool = False,
+        cols: Optional[List[Dict[str, Any]]] = None,
+    ) -> str:
+        """Generates a ``select *`` statement in the proper dialect"""
+        eng = self.get_sqla_engine(schema=schema, source=utils.QuerySource.SQL_LAB)
+        return self.db_engine_spec.select_star(
+            self,
+            table_name,
+            schema=schema,
+            engine=eng,
+            limit=limit,
+            show_cols=show_cols,
+            indent=indent,
+            latest_partition=latest_partition,
+            cols=cols,
+        )
+
+    def apply_limit_to_sql(self, sql: str, limit: int = 1000) -> str:
+        return self.db_engine_spec.apply_limit_to_sql(sql, limit, self)
+
+    def safe_sqlalchemy_uri(self) -> str:
+        return self.sqlalchemy_uri
+
+    @property
+    def inspector(self) -> Inspector:
+        engine = self.get_sqla_engine()
+        return sqla.inspect(engine)
+
+    @cache_util.memoized_func(
+        key=lambda *args, **kwargs: "db:{}:schema:None:table_list",
+        attribute_in_key="id",
+    )
+    def get_all_table_names_in_database(
+        self,
+        cache: bool = False,
+        cache_timeout: Optional[bool] = None,
+        force: bool = False,
+    ) -> List[utils.DatasourceName]:
+        """Parameters need to be passed as keyword arguments."""
+        if not self.allow_multi_schema_metadata_fetch:
+            return []
+        return self.db_engine_spec.get_all_datasource_names(self, "table")
+
+    @cache_util.memoized_func(
+        key=lambda *args, **kwargs: "db:{}:schema:None:view_list", attribute_in_key="id"
+    )
+    def get_all_view_names_in_database(
+        self,
+        cache: bool = False,
+        cache_timeout: Optional[bool] = None,
+        force: bool = False,
+    ) -> List[utils.DatasourceName]:
+        """Parameters need to be passed as keyword arguments."""
+        if not self.allow_multi_schema_metadata_fetch:
+            return []
+        return self.db_engine_spec.get_all_datasource_names(self, "view")
+
+    @cache_util.memoized_func(
+        key=lambda *args, **kwargs: f"db:{{}}:schema:{kwargs.get('schema')}:table_list",  # type: ignore
+        attribute_in_key="id",
+    )
+    def get_all_table_names_in_schema(
+        self,
+        schema: str,
+        cache: bool = False,
+        cache_timeout: Optional[int] = None,
+        force: bool = False,
+    ) -> List[utils.DatasourceName]:
+        """Parameters need to be passed as keyword arguments.
+
+        For unused parameters, they are referenced in
+        cache_util.memoized_func decorator.
+
+        :param schema: schema name
+        :param cache: whether cache is enabled for the function
+        :param cache_timeout: timeout in seconds for the cache
+        :param force: whether to force refresh the cache
+        :return: list of tables
+        """
+        try:
+            tables = self.db_engine_spec.get_table_names(
+                database=self, inspector=self.inspector, schema=schema
+            )
+            return [
+                utils.DatasourceName(table=table, schema=schema) for table in tables
+            ]
+        except Exception as ex:  # pylint: disable=broad-except
+            logger.exception(ex)
+
+    @cache_util.memoized_func(
+        key=lambda *args, **kwargs: f"db:{{}}:schema:{kwargs.get('schema')}:view_list",  # type: ignore
+        attribute_in_key="id",
+    )
+    def get_all_view_names_in_schema(
+        self,
+        schema: str,
+        cache: bool = False,
+        cache_timeout: Optional[int] = None,
+        force: bool = False,
+    ) -> List[utils.DatasourceName]:
+        """Parameters need to be passed as keyword arguments.
+
+        For unused parameters, they are referenced in
+        cache_util.memoized_func decorator.
+
+        :param schema: schema name
+        :param cache: whether cache is enabled for the function
+        :param cache_timeout: timeout in seconds for the cache
+        :param force: whether to force refresh the cache
+        :return: list of views
+        """
+        try:
+            views = self.db_engine_spec.get_view_names(
+                database=self, inspector=self.inspector, schema=schema
+            )
+            return [utils.DatasourceName(table=view, schema=schema) for view in views]
+        except Exception as ex:  # pylint: disable=broad-except
+            logger.exception(ex)
+
+    @cache_util.memoized_func(
+        key=lambda *args, **kwargs: "db:{}:schema_list", attribute_in_key="id"
+    )
+    def get_all_schema_names(
+        self,
+        cache: bool = False,
+        cache_timeout: Optional[int] = None,
+        force: bool = False,
+    ) -> List[str]:
+        """Parameters need to be passed as keyword arguments.
+
+        For unused parameters, they are referenced in
+        cache_util.memoized_func decorator.
+
+        :param cache: whether cache is enabled for the function
+        :param cache_timeout: timeout in seconds for the cache
+        :param force: whether to force refresh the cache
+        :return: schema list
+        """
+        return self.db_engine_spec.get_schema_names(self.inspector)
+
+    @property
+    def db_engine_spec(self) -> Type[db_engine_specs.BaseEngineSpec]:
+        return db_engine_specs.engines.get(self.backend, db_engine_specs.BaseEngineSpec)
+
+    @classmethod
+    def get_db_engine_spec_for_backend(
+        cls, backend: str
+    ) -> Type[db_engine_specs.BaseEngineSpec]:
+        return db_engine_specs.engines.get(backend, db_engine_specs.BaseEngineSpec)
+
+    def grains(self) -> Tuple[TimeGrain, ...]:
+        """Defines time granularity database-specific expressions.
+
+        The idea here is to make it easy for users to change the time grain
+        from a datetime (maybe the source grain is arbitrary timestamps, daily
+        or 5 minutes increments) to another, "truncated" datetime. Since
+        each database has slightly different but similar datetime functions,
+        this allows a mapping between database engines and actual functions.
+        """
+        return self.db_engine_spec.get_time_grains()
+
+    def get_extra(self) -> Dict[str, Any]:
+        return self.db_engine_spec.get_extra_params(self)
+
+    def get_encrypted_extra(self) -> Dict[str, Any]:
+        encrypted_extra = {}
+        if self.encrypted_extra:
+            try:
+                encrypted_extra = json.loads(self.encrypted_extra)
+            except json.JSONDecodeError as ex:
+                logger.error(ex)
+                raise ex
+        return encrypted_extra
+
+    def get_table(self, table_name: str, schema: Optional[str] = None) -> Table:
+        extra = self.get_extra()
+        meta = MetaData(**extra.get("metadata_params", {}))
+        return Table(
+            table_name,
+            meta,
+            schema=schema or None,
+            autoload=True,
+            autoload_with=self.get_sqla_engine(),
+        )
+
+    def get_columns(
+        self, table_name: str, schema: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        return self.db_engine_spec.get_columns(self.inspector, table_name, schema)
+
+    def get_indexes(
+        self, table_name: str, schema: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        return self.inspector.get_indexes(table_name, schema)
+
+    def get_pk_constraint(
+        self, table_name: str, schema: Optional[str] = None
+    ) -> Dict[str, Any]:
+        return self.inspector.get_pk_constraint(table_name, schema)
+
+    def get_foreign_keys(
+        self, table_name: str, schema: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        return self.inspector.get_foreign_keys(table_name, schema)
+
+    def get_schema_access_for_csv_upload(  # pylint: disable=invalid-name
+        self,
+    ) -> List[str]:
+        allowed_databases = self.get_extra().get("schemas_allowed_for_csv_upload", [])
+        if hasattr(g, "user"):
+            extra_allowed_databases = config["ALLOWED_USER_CSV_SCHEMA_FUNC"](
+                self, g.user
+            )
+            allowed_databases += extra_allowed_databases
+        return sorted(set(allowed_databases))
+
+    @property
+    def sqlalchemy_uri_decrypted(self) -> str:
+        conn = sqla.engine.url.make_url(self.sqlalchemy_uri)
+        if custom_password_store:
+            conn.password = custom_password_store(conn)
+        else:
+            conn.password = self.password
+        return str(conn)
+
+    @property
+    def sql_url(self) -> str:
+        return f"/superset/sql/{self.id}/"
+
+    @hybrid_property
+    def perm(self) -> str:
+        return f"[{self.database_name}].(id:{self.id})"
+
+    @perm.expression  # type: ignore
+    def perm(cls) -> str:  # pylint: disable=no-self-argument
+        return (
+            "[" + cls.database_name + "].(id:" + expression.cast(cls.id, String) + ")"
+        )
+
+    def get_perm(self) -> str:
+        return self.perm  # type: ignore
+
+    def has_table(self, table: Table) -> bool:
+        engine = self.get_sqla_engine()
+        return engine.has_table(table.table_name, table.schema or None)
+
+    def has_table_by_name(self, table_name: str, schema: Optional[str] = None) -> bool:
+        engine = self.get_sqla_engine()
+        return engine.has_table(table_name, schema)
+
+    @utils.memoized
+    def get_dialect(self) -> Dialect:
+        sqla_url = url.make_url(self.sqlalchemy_uri_decrypted)
+        return sqla_url.get_dialect()()  # pylint: disable=no-member
+
+
+sqla.event.listen(Database, "after_insert", security_manager.set_perm)
+sqla.event.listen(Database, "after_update", security_manager.set_perm)
+
+
+class Log(Model):  # pylint: disable=too-few-public-methods
+
+    """ORM object used to log Superset actions to the database"""
+
+    __tablename__ = "logs"
+
+    id = Column(Integer, primary_key=True)
+    action = Column(String(512))
+    user_id = Column(Integer, ForeignKey("ab_user.id"))
+    dashboard_id = Column(Integer)
+    slice_id = Column(Integer)
+    json = Column(Text)
+    user = relationship(
+        security_manager.user_model, backref="logs", foreign_keys=[user_id]
+    )
+    dttm = Column(DateTime, default=datetime.utcnow)
+    duration_ms = Column(Integer)
+    referrer = Column(String(1024))
+
+
+class FavStar(Model):  # pylint: disable=too-few-public-methods
+    __tablename__ = "favstar"
+
+    id = Column(Integer, primary_key=True)
+    user_id = Column(Integer, ForeignKey("ab_user.id"))
+    class_name = Column(String(50))
+    obj_id = Column(Integer)
+    dttm = Column(DateTime, default=datetime.utcnow)
+
+
+# events for updating tags
+if is_feature_enabled("TAGGING_SYSTEM"):
+    sqla.event.listen(Dashboard, "after_insert", DashboardUpdater.after_insert)
+    sqla.event.listen(Dashboard, "after_update", DashboardUpdater.after_update)
+    sqla.event.listen(Dashboard, "after_delete", DashboardUpdater.after_delete)
+    sqla.event.listen(FavStar, "after_insert", FavStarUpdater.after_insert)
+    sqla.event.listen(FavStar, "after_delete", FavStarUpdater.after_delete)
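
A note on the engine-spec lookup used by the Database model above: `db_engine_spec` resolves the spec class from a registry keyed by backend name and falls back to `BaseEngineSpec` when the backend is unknown. A minimal, self-contained sketch of that registry-with-fallback pattern (the classes and registry below are illustrative stand-ins, not Superset's actual `db_engine_specs` objects):

    # illustrative sketch of the engines.get(backend, BaseEngineSpec) dispatch
    class BaseEngineSpec:
        engine = "base"

        @classmethod
        def get_time_grains(cls):
            # hypothetical defaults; real specs return engine-specific TimeGrain tuples
            return ("PT1S", "PT1M", "PT1H", "P1D")

    class PostgresEngineSpec(BaseEngineSpec):
        engine = "postgresql"

    engines = {spec.engine: spec for spec in (PostgresEngineSpec,)}

    def spec_for_backend(backend: str):
        # unknown backends fall back to the base spec, mirroring the property above
        return engines.get(backend, BaseEngineSpec)

    assert spec_for_backend("postgresql") is PostgresEngineSpec
    assert spec_for_backend("sqlite") is BaseEngineSpec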

+ 197 - 0
BI/etl_testing.py

@@ -0,0 +1,197 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue May 12 00:00:00 2020
+
+@author: Shaji
+"""
+
+from . import exceptions
+
+from datetime import datetime
+import os
+import pandas as pd
+
+def column_level_check(source_df,target_df,primary_keys):
+    """
+    Usage: [arg1]:[Pandas DataFrame - source], [arg2]:[Pandas DataFrame - target], [arg3]:[Primary keys (separated by comma)]
+    Description: Performs column level testing between two DataFrames by joining using the primary keys.
+    Returns: [Mismatch Count], [Test Log (list)], [Pandas dataframe - mismatch (if any)]
+    """
+    global execution_status
+
+    systime=datetime.now()
+
+    start_time=systime.strftime("%Y")+'-'+systime.strftime("%m")+'-'+systime.strftime("%d")+' '+systime.strftime("%H")+':'+systime.strftime("%M")+':'+systime.strftime("%S")
+
+    log_list=[]
+
+    execution_status='RUNNING'
+
+    log_list.append('START TIME: '+start_time)
+
+    key_list=primary_keys.split(',')
+
+    src=source_df
+    tgt=target_df
+
+    log_list.append(str(datetime.now())+': DIFFERENTIATING SOURCE AND TARGET COLUMNS')
+    if execution_status!='FAILED':
+        try:
+            src_k=[]
+            src_columns=[]
+            for i  in src.columns:
+                if str.lower(i) in [str.lower(key) for key in key_list]:
+                    src_columns.append(str.lower(i))
+                    src_k.append(str.lower(i))
+                else:
+                    src_columns.append(str(i) + '_src')
+            src.columns = src_columns
+            tgt_k=[]
+            tgt_columns=[]
+            for i  in tgt.columns:
+                if str.lower(i) in [str.lower(key) for key in key_list]:
+                    tgt_columns.append(str.lower(i))
+                    tgt_k.append(str.lower(i))
+                else:
+                    tgt_columns.append(str(i) + '_tgt')
+            tgt.columns = tgt_columns
+        except Exception as e:
+            print('Failed while DIFFERENTIATING SOURCE AND TARGET COLUMNS: '+str(e))
+            log_list.append('Failed while DIFFERENTIATING SOURCE AND TARGET COLUMNS: '+str(e))
+            execution_status='FAILED'
+    log_list.append(str(datetime.now())+': CHECKING IF THE GROUP BY MAKES THE RECORD LEVEL SAME AS ACTUAL')
+    if execution_status!='FAILED':
+        try:
+            index_unique_flag=[]
+            if src.groupby(src_k).count().shape[0]==src.shape[0]:
+                index_unique_flag.append(True)
+            else:
+                index_unique_flag.append(False)
+            if tgt.groupby(tgt_k).count().shape[0]==tgt.shape[0]:
+                index_unique_flag.append(True)
+            else:
+                index_unique_flag.append(False)
+        except Exception as e:
+            print('Failed while CHECKING IF THE GROUP BY MAKES THE RECORD LEVEL SAME AS ACTUAL: '+str(e))
+            log_list.append('Failed while CHECKING IF THE GROUP BY MAKES THE RECORD LEVEL SAME AS ACTUAL: '+str(e))
+            execution_status='FAILED'
+    if execution_status!='FAILED':
+        try:
+            if all(index_unique_flag)==True:
+                log_list.append(str(datetime.now())+': JOINING THE TABLES')
+                try:
+                    df=tgt.set_index(tgt_k).join(src.set_index(src_k),how='left')
+                except Exception as e:
+                    print('Failed while JOINING THE TABLES: '+str(e))
+                    log_list.append('Failed while JOINING THE TABLES: '+str(e))
+                    execution_status='FAILED'
+                log_list.append(str(datetime.now())+': FINDING THE TARGET COLUMN AND SOURCE COLUMN TO BE COMPARED')
+                if execution_status!='FAILED':
+                    try:
+                        ma_list=[]
+                        for i in range(len(df.columns)):
+                            if df.columns[i][-3:]=='tgt':
+                                for j in range(len(df.columns)):
+                                    if df.columns[j][-3:]=='src':
+                                        if str.lower(df.columns[i][:-4])==str.lower(df.columns[j][:-4]):
+                                            ma_list.append([j,i])
+                        match_cols=''
+                        for i in range(len(ma_list)):
+                            match_cols+=str(i+1)+': '+df.columns[ma_list[i][1]]+' = '+df.columns[ma_list[i][0]]+' , '
+                        log_list.append('Matching columns '+match_cols)
+                    except Exception as e:
+                        print('Failed while FINDING THE TARGET COLUMN AND SOURCE COLUMN TO BE COMPARED: '+str(e))
+                        log_list.append('Failed while FINDING THE TARGET COLUMN AND SOURCE COLUMN TO BE COMPARED: '+str(e))
+                        execution_status='FAILED'
+                log_list.append(str(datetime.now())+': COMPARISON STARTED')
+                if execution_status!='FAILED':
+                    try:
+                        mis_cols=[]
+                        res=[]
+                        index=[]
+                        for i in range(len(ma_list)):
+                            if all(df[df.columns[ma_list[i][0]]].apply(lambda x:str(x).strip()).astype(str).fillna(str(0))==df[df.columns[ma_list[i][1]]].apply(lambda x:str(x).strip()).astype(str).fillna(str(0)))==True:
+                                res.append(True)
+                            else:
+                                res.append(False)
+                                mis_cols.append(df.columns[ma_list[i][0]])
+                                mis_cols.append(df.columns[ma_list[i][1]])
+                                for j in range(len(df[df.columns[ma_list[i][0]]].apply(lambda x:str(x).strip()).astype(str).fillna(str(0))==df[df.columns[ma_list[i][1]]].apply(lambda x:str(x).strip()).astype(str).fillna(str(0)))):
+                                    if list(df[df.columns[ma_list[i][0]]].apply(lambda x:str(x).strip()).astype(str).fillna(str(0))==df[df.columns[ma_list[i][1]]].apply(lambda x:str(x).strip()).astype(str).fillna(str(0)))[j]==False:
+                                        index.append(j)
+                        un_df=df[mis_cols].iloc[list(set(index))]
+                    except Exception as e:
+                        print('Failed while COMPARING: '+str(e))
+                        log_list.append('Failed while COMPARING: '+str(e))
+                        execution_status='FAILED'
+                log_list.append(str(datetime.now())+': TEST RESULT:')
+                if execution_status!='FAILED':
+                    try:
+                        if all(res)==True:
+                            mismatch_count=0
+                            print('COLUMN LEVEL CHECK PASSED')
+                            execution_status='SUCCESS'
+                            log_list.append('COLUMN LEVEL CHECK PASSED')
+                        else:
+                            log_list.append((str(len(set(index)))+' records unmatched'))
+                            log_list.append('Column level check Failed')
+                            mismatch_count=len(set(index))
+                            execution_status='SUCCESS'
+                    except Exception as e:
+                        print('Failed while getting the TEST RESULT: '+str(e))
+                        log_list.append('Failed while getting the TEST RESULT: '+str(e))
+                        execution_status='FAILED'
+            else:
+                log_list.append('The records grouped at the level of key columns are not unique')
+                execution_status='FAILED'
+        except Exception as e:
+            log_list.append('Failed while CHECKING IF THE GROUP BY MAKES THE RECORD LEVEL SAME AS ACTUAL: '+str(e))
+            execution_status='FAILED'
+    if execution_status=='FAILED':
+        print('Check Logs for the error message')
+        raise exceptions.ExecutionError
+    return mismatch_count,log_list,un_df
+
+def sort_and_compare(source_df,target_df):
+    """
+    Usage: [arg1]:[Pandas DataFrame - source], [arg2]:[Pandas DataFrame - target]
+    Description: Sorts the two DataFrames on their common columns and compares them row by row.
+    Returns: [Mismatch Count], [Test Log (list)], [Pandas dataframe - mismatch (if any)]
+    """
+    log_list=[]
+    col1=source_df.columns
+    col2=target_df.columns
+    cols=list(set(col1.sort_values()).intersection(set(col2.sort_values())))
+    log_list.append('Common column(s): '+', '.join(cols))
+
+    source_df.sort_values(cols, axis=0, ascending=True, inplace=True)
+    target_df.sort_values(cols, axis=0, ascending=True, inplace=True)
+
+    data1=source_df[cols].reset_index(drop=True)
+    data2=target_df[cols].reset_index(drop=True)
+
+    data1.head()
+    data2.head()
+
+    result=data1==data2
+    bool_list=[]
+    mis_cols=[]
+    mis_index=[]
+    for i in cols:
+        if all(result[i])==True:
+            bool_list.append(True)
+        else:
+            bool_list.append(False)
+            mis_cols.append(i)
+            for j in range(len(result[i])):
+                if result[i][j]==False:
+                    mis_index.append(j)
+    un_df=pd.concat([data1.iloc[list(set(mis_index))],data2.iloc[list(set(mis_index))]],axis=1)
+
+    mismatch_count=0
+    if all(bool_list)==True:
+        log_list.append('Records are matching')
+    else:
+        mismatch_count=len(set(mis_index))
+        log_list.append(str(mismatch_count)+' records unmatched')
+        log_list.append('Column(s): '+', '.join(mis_cols))
+    return mismatch_count,log_list,un_df[mis_cols]
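
A minimal usage sketch for the two helpers above. The package name `mypkg` and the sample frames are illustrative; the module relies on a relative `from . import exceptions`, so it must be imported as part of a package:

    import pandas as pd
    from mypkg import etl_testing  # hypothetical package holding etl_testing.py

    source = pd.DataFrame({"id": [1, 2, 3], "amount": [10, 20, 30]})
    target = pd.DataFrame({"id": [1, 2, 3], "amount": [10, 25, 30]})

    # column-level comparison, joined on the primary key(s)
    count, log, mismatches = etl_testing.column_level_check(
        source.copy(), target.copy(), "id"
    )
    print(count)       # 1 -> one record differs on amount
    print(mismatches)  # the offending amount_src / amount_tgt pair

    # order-insensitive comparison over the common columns
    count, log, mismatches = etl_testing.sort_and_compare(source, target)
    print(count)       # 1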

+ 33 - 0
BI/examples/__init__.py

@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from .bart_lines import load_bart_lines
+from .birth_names import load_birth_names
+from .country_map import load_country_map_data
+from .css_templates import load_css_templates
+from .deck import load_deck_dash
+from .energy import load_energy
+from .flights import load_flights
+from .long_lat import load_long_lat_data
+from .misc_dashboard import load_misc_dashboard
+from .multi_line import load_multi_line
+from .multiformat_time_series import load_multiformat_time_series
+from .paris import load_paris_iris_geojson
+from .random_time_series import load_random_time_series_data
+from .sf_population_polygons import load_sf_population_polygons
+from .tabbed_dashboard import load_tabbed_dashboard
+from .unicode_test_data import load_unicode_test_data
+from .world_bank import load_world_bank_health_n_pop

+ 63 - 0
BI/examples/bart_lines.py

@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import json
+
+import pandas as pd
+import polyline
+from sqlalchemy import String, Text
+
+from superset import db
+from superset.utils.core import get_example_database
+
+from .helpers import get_example_data, TBL
+
+
+def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None:
+    tbl_name = "bart_lines"
+    database = get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        content = get_example_data("bart-lines.json.gz")
+        df = pd.read_json(content, encoding="latin-1")
+        df["path_json"] = df.path.map(json.dumps)
+        df["polyline"] = df.path.map(polyline.encode)
+        del df["path"]
+
+        df.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={
+                "color": String(255),
+                "name": String(255),
+                "polyline": Text,
+                "path_json": Text,
+            },
+            index=False,
+        )
+
+    print("Creating table {} reference".format(tbl_name))
+    tbl = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not tbl:
+        tbl = TBL(table_name=tbl_name)
+    tbl.description = "BART lines"
+    tbl.database = database
+    db.session.merge(tbl)
+    db.session.commit()
+    tbl.fetch_metadata()
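
The loader above stores each BART path twice: once as raw JSON (`path_json`) and once as a Google-encoded polyline (`polyline`). A quick sketch of what the two transformations produce, using made-up coordinates:

    import json
    import polyline  # the PyPI "polyline" package used above

    path = [(37.7793, -122.4193), (37.8044, -122.2711)]  # illustrative (lat, lon) pairs
    print(json.dumps(path))       # plain JSON text, as stored in path_json
    print(polyline.encode(path))  # compact encoded string, as stored in polyline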

+ 763 - 0
BI/examples/birth_names.py

@@ -0,0 +1,763 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import json
+import textwrap
+from typing import Dict, Union
+
+import pandas as pd
+from sqlalchemy import DateTime, String
+from sqlalchemy.sql import column
+
+from superset import db, security_manager
+from superset.connectors.sqla.models import SqlMetric, TableColumn
+from superset.models.core import Database
+from superset.models.dashboard import Dashboard
+from superset.models.slice import Slice
+from superset.utils.core import get_example_database
+
+from .helpers import (
+    config,
+    get_example_data,
+    get_slice_json,
+    merge_slice,
+    misc_dash_slices,
+    TBL,
+    update_slice_ids,
+)
+
+
+def gen_filter(
+    subject: str, comparator: str, operator: str = "=="
+) -> Dict[str, Union[bool, str]]:
+    return {
+        "clause": "WHERE",
+        "comparator": comparator,
+        "expressionType": "SIMPLE",
+        "operator": operator,
+        "subject": subject,
+    }
+
+
+def load_data(tbl_name: str, database: Database) -> None:
+    pdf = pd.read_json(get_example_data("birth_names.json.gz"))
+    pdf.ds = pd.to_datetime(pdf.ds, unit="ms")
+    pdf.to_sql(
+        tbl_name,
+        database.get_sqla_engine(),
+        if_exists="replace",
+        chunksize=500,
+        dtype={
+            "ds": DateTime,
+            "gender": String(16),
+            "state": String(10),
+            "name": String(255),
+        },
+        index=False,
+    )
+    print("Done loading table!")
+    print("-" * 80)
+
+
+def load_birth_names(only_metadata: bool = False, force: bool = False) -> None:
+    """Loading birth name dataset from a zip file in the repo"""
+    # pylint: disable=too-many-locals
+    tbl_name = "birth_names"
+    database = get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        load_data(tbl_name, database)
+
+    obj = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not obj:
+        print(f"Creating table [{tbl_name}] reference")
+        obj = TBL(table_name=tbl_name)
+        db.session.add(obj)
+    obj.main_dttm_col = "ds"
+    obj.database = database
+    obj.filter_select_enabled = True
+
+    if not any(col.column_name == "num_california" for col in obj.columns):
+        col_state = str(column("state").compile(db.engine))
+        col_num = str(column("num").compile(db.engine))
+        obj.columns.append(
+            TableColumn(
+                column_name="num_california",
+                expression=f"CASE WHEN {col_state} = 'CA' THEN {col_num} ELSE 0 END",
+            )
+        )
+
+    if not any(col.metric_name == "sum__num" for col in obj.metrics):
+        col = str(column("num").compile(db.engine))
+        obj.metrics.append(SqlMetric(metric_name="sum__num", expression=f"SUM({col})"))
+
+    db.session.commit()
+    obj.fetch_metadata()
+    tbl = obj
+
+    metrics = [
+        {
+            "expressionType": "SIMPLE",
+            "column": {"column_name": "num", "type": "BIGINT"},
+            "aggregate": "SUM",
+            "label": "Births",
+            "optionName": "metric_11",
+        }
+    ]
+    metric = "sum__num"
+
+    defaults = {
+        "compare_lag": "10",
+        "compare_suffix": "o10Y",
+        "limit": "25",
+        "granularity_sqla": "ds",
+        "groupby": [],
+        "row_limit": config["ROW_LIMIT"],
+        "since": "100 years ago",
+        "until": "now",
+        "viz_type": "table",
+        "markup_type": "markdown",
+    }
+
+    admin = security_manager.find_user("admin")
+
+    print("Creating some slices")
+    slices = [
+        Slice(
+            slice_name="Participants",
+            viz_type="big_number",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="big_number",
+                granularity_sqla="ds",
+                compare_lag="5",
+                compare_suffix="over 5Y",
+                metric=metric,
+            ),
+        ),
+        Slice(
+            slice_name="Genders",
+            viz_type="pie",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults, viz_type="pie", groupby=["gender"], metric=metric
+            ),
+        ),
+        Slice(
+            slice_name="Trends",
+            viz_type="line",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="line",
+                groupby=["name"],
+                granularity_sqla="ds",
+                rich_tooltip=True,
+                show_legend=True,
+                metrics=metrics,
+            ),
+        ),
+        Slice(
+            slice_name="Genders by State",
+            viz_type="dist_bar",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                adhoc_filters=[
+                    {
+                        "clause": "WHERE",
+                        "expressionType": "SIMPLE",
+                        "filterOptionName": "2745eae5",
+                        "comparator": ["other"],
+                        "operator": "NOT IN",
+                        "subject": "state",
+                    }
+                ],
+                viz_type="dist_bar",
+                metrics=[
+                    {
+                        "expressionType": "SIMPLE",
+                        "column": {"column_name": "sum_boys", "type": "BIGINT(20)"},
+                        "aggregate": "SUM",
+                        "label": "Boys",
+                        "optionName": "metric_11",
+                    },
+                    {
+                        "expressionType": "SIMPLE",
+                        "column": {"column_name": "sum_girls", "type": "BIGINT(20)"},
+                        "aggregate": "SUM",
+                        "label": "Girls",
+                        "optionName": "metric_12",
+                    },
+                ],
+                groupby=["state"],
+            ),
+        ),
+        Slice(
+            slice_name="Girls",
+            viz_type="table",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                groupby=["name"],
+                adhoc_filters=[gen_filter("gender", "girl")],
+                row_limit=50,
+                timeseries_limit_metric="sum__num",
+                metrics=metrics,
+            ),
+        ),
+        Slice(
+            slice_name="Girl Name Cloud",
+            viz_type="word_cloud",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="word_cloud",
+                size_from="10",
+                series="name",
+                size_to="70",
+                rotation="square",
+                limit="100",
+                adhoc_filters=[gen_filter("gender", "girl")],
+                metric=metric,
+            ),
+        ),
+        Slice(
+            slice_name="Boys",
+            viz_type="table",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                groupby=["name"],
+                adhoc_filters=[gen_filter("gender", "boy")],
+                row_limit=50,
+                metrics=metrics,
+            ),
+        ),
+        Slice(
+            slice_name="Boy Name Cloud",
+            viz_type="word_cloud",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="word_cloud",
+                size_from="10",
+                series="name",
+                size_to="70",
+                rotation="square",
+                limit="100",
+                adhoc_filters=[gen_filter("gender", "boy")],
+                metric=metric,
+            ),
+        ),
+        Slice(
+            slice_name="Top 10 Girl Name Share",
+            viz_type="area",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                adhoc_filters=[gen_filter("gender", "girl")],
+                comparison_type="values",
+                groupby=["name"],
+                limit=10,
+                stacked_style="expand",
+                time_grain_sqla="P1D",
+                viz_type="area",
+                x_axis_format="smart_date",
+                metrics=metrics,
+            ),
+        ),
+        Slice(
+            slice_name="Top 10 Boy Name Share",
+            viz_type="area",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                adhoc_filters=[gen_filter("gender", "boy")],
+                comparison_type="values",
+                groupby=["name"],
+                limit=10,
+                stacked_style="expand",
+                time_grain_sqla="P1D",
+                viz_type="area",
+                x_axis_format="smart_date",
+                metrics=metrics,
+            ),
+        ),
+    ]
+    misc_slices = [
+        Slice(
+            slice_name="Average and Sum Trends",
+            viz_type="dual_line",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="dual_line",
+                metric={
+                    "expressionType": "SIMPLE",
+                    "column": {"column_name": "num", "type": "BIGINT(20)"},
+                    "aggregate": "AVG",
+                    "label": "AVG(num)",
+                    "optionName": "metric_vgops097wej_g8uff99zhk7",
+                },
+                metric_2="sum__num",
+                granularity_sqla="ds",
+                metrics=metrics,
+            ),
+        ),
+        Slice(
+            slice_name="Num Births Trend",
+            viz_type="line",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(defaults, viz_type="line", metrics=metrics),
+        ),
+        Slice(
+            slice_name="Daily Totals",
+            viz_type="table",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            created_by=admin,
+            params=get_slice_json(
+                defaults,
+                groupby=["ds"],
+                since="40 years ago",
+                until="now",
+                viz_type="table",
+                metrics=metrics,
+            ),
+        ),
+        Slice(
+            slice_name="Number of California Births",
+            viz_type="big_number_total",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                metric={
+                    "expressionType": "SIMPLE",
+                    "column": {
+                        "column_name": "num_california",
+                        "expression": "CASE WHEN state = 'CA' THEN num ELSE 0 END",
+                    },
+                    "aggregate": "SUM",
+                    "label": "SUM(num_california)",
+                },
+                viz_type="big_number_total",
+                granularity_sqla="ds",
+            ),
+        ),
+        Slice(
+            slice_name="Top 10 California Names Timeseries",
+            viz_type="line",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                metrics=[
+                    {
+                        "expressionType": "SIMPLE",
+                        "column": {
+                            "column_name": "num_california",
+                            "expression": "CASE WHEN state = 'CA' THEN num ELSE 0 END",
+                        },
+                        "aggregate": "SUM",
+                        "label": "SUM(num_california)",
+                    }
+                ],
+                viz_type="line",
+                granularity_sqla="ds",
+                groupby=["name"],
+                timeseries_limit_metric={
+                    "expressionType": "SIMPLE",
+                    "column": {
+                        "column_name": "num_california",
+                        "expression": "CASE WHEN state = 'CA' THEN num ELSE 0 END",
+                    },
+                    "aggregate": "SUM",
+                    "label": "SUM(num_california)",
+                },
+                limit="10",
+            ),
+        ),
+        Slice(
+            slice_name="Names Sorted by Num in California",
+            viz_type="table",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                metrics=metrics,
+                groupby=["name"],
+                row_limit=50,
+                timeseries_limit_metric={
+                    "expressionType": "SIMPLE",
+                    "column": {
+                        "column_name": "num_california",
+                        "expression": "CASE WHEN state = 'CA' THEN num ELSE 0 END",
+                    },
+                    "aggregate": "SUM",
+                    "label": "SUM(num_california)",
+                },
+            ),
+        ),
+        Slice(
+            slice_name="Number of Girls",
+            viz_type="big_number_total",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                metric=metric,
+                viz_type="big_number_total",
+                granularity_sqla="ds",
+                adhoc_filters=[gen_filter("gender", "girl")],
+                subheader="total female participants",
+            ),
+        ),
+        Slice(
+            slice_name="Pivot Table",
+            viz_type="pivot_table",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="pivot_table",
+                groupby=["name"],
+                columns=["state"],
+                metrics=metrics,
+            ),
+        ),
+    ]
+    for slc in slices:
+        merge_slice(slc)
+
+    for slc in misc_slices:
+        merge_slice(slc)
+        misc_dash_slices.add(slc.slice_name)
+
+    print("Creating a dashboard")
+    dash = db.session.query(Dashboard).filter_by(slug="births").first()
+
+    if not dash:
+        dash = Dashboard()
+        db.session.add(dash)
+    dash.published = True
+    dash.json_metadata = textwrap.dedent(
+        """\
+    {
+        "label_colors": {
+            "Girls": "#FF69B4",
+            "Boys": "#ADD8E6",
+            "girl": "#FF69B4",
+            "boy": "#ADD8E6"
+        }
+    }"""
+    )
+    js = textwrap.dedent(
+        # pylint: disable=line-too-long
+        """\
+        {
+          "CHART-6GdlekVise": {
+            "children": [],
+            "id": "CHART-6GdlekVise",
+            "meta": {
+              "chartId": 5547,
+              "height": 50,
+              "sliceName": "Top 10 Girl Name Share",
+              "width": 5
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-eh0w37bWbR"
+            ],
+            "type": "CHART"
+          },
+          "CHART-6n9jxb30JG": {
+            "children": [],
+            "id": "CHART-6n9jxb30JG",
+            "meta": {
+              "chartId": 5540,
+              "height": 36,
+              "sliceName": "Genders by State",
+              "width": 5
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW--EyBZQlDi"
+            ],
+            "type": "CHART"
+          },
+          "CHART-Jj9qh1ol-N": {
+            "children": [],
+            "id": "CHART-Jj9qh1ol-N",
+            "meta": {
+              "chartId": 5545,
+              "height": 50,
+              "sliceName": "Boy Name Cloud",
+              "width": 4
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-kzWtcvo8R1"
+            ],
+            "type": "CHART"
+          },
+          "CHART-ODvantb_bF": {
+            "children": [],
+            "id": "CHART-ODvantb_bF",
+            "meta": {
+              "chartId": 5548,
+              "height": 50,
+              "sliceName": "Top 10 Boy Name Share",
+              "width": 5
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-kzWtcvo8R1"
+            ],
+            "type": "CHART"
+          },
+          "CHART-PAXUUqwmX9": {
+            "children": [],
+            "id": "CHART-PAXUUqwmX9",
+            "meta": {
+              "chartId": 5538,
+              "height": 34,
+              "sliceName": "Genders",
+              "width": 3
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-2n0XgiHDgs"
+            ],
+            "type": "CHART"
+          },
+          "CHART-_T6n_K9iQN": {
+            "children": [],
+            "id": "CHART-_T6n_K9iQN",
+            "meta": {
+              "chartId": 5539,
+              "height": 36,
+              "sliceName": "Trends",
+              "width": 7
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW--EyBZQlDi"
+            ],
+            "type": "CHART"
+          },
+          "CHART-eNY0tcE_ic": {
+            "children": [],
+            "id": "CHART-eNY0tcE_ic",
+            "meta": {
+              "chartId": 5537,
+              "height": 34,
+              "sliceName": "Participants",
+              "width": 3
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-2n0XgiHDgs"
+            ],
+            "type": "CHART"
+          },
+          "CHART-g075mMgyYb": {
+            "children": [],
+            "id": "CHART-g075mMgyYb",
+            "meta": {
+              "chartId": 5541,
+              "height": 50,
+              "sliceName": "Girls",
+              "width": 3
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-eh0w37bWbR"
+            ],
+            "type": "CHART"
+          },
+          "CHART-n-zGGE6S1y": {
+            "children": [],
+            "id": "CHART-n-zGGE6S1y",
+            "meta": {
+              "chartId": 5542,
+              "height": 50,
+              "sliceName": "Girl Name Cloud",
+              "width": 4
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-eh0w37bWbR"
+            ],
+            "type": "CHART"
+          },
+          "CHART-vJIPjmcbD3": {
+            "children": [],
+            "id": "CHART-vJIPjmcbD3",
+            "meta": {
+              "chartId": 5543,
+              "height": 50,
+              "sliceName": "Boys",
+              "width": 3
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-kzWtcvo8R1"
+            ],
+            "type": "CHART"
+          },
+          "DASHBOARD_VERSION_KEY": "v2",
+          "GRID_ID": {
+            "children": [
+              "ROW-2n0XgiHDgs",
+              "ROW--EyBZQlDi",
+              "ROW-eh0w37bWbR",
+              "ROW-kzWtcvo8R1"
+            ],
+            "id": "GRID_ID",
+            "parents": [
+              "ROOT_ID"
+            ],
+            "type": "GRID"
+          },
+          "HEADER_ID": {
+            "id": "HEADER_ID",
+            "meta": {
+              "text": "Births"
+            },
+            "type": "HEADER"
+          },
+          "MARKDOWN-zaflB60tbC": {
+            "children": [],
+            "id": "MARKDOWN-zaflB60tbC",
+            "meta": {
+              "code": "<div style=\\"text-align:center\\">  <h1>Birth Names Dashboard</h1>  <img src=\\"/static/assets/images/babies.png\\" style=\\"width:50%;\\"></div>",
+              "height": 34,
+              "width": 6
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-2n0XgiHDgs"
+            ],
+            "type": "MARKDOWN"
+          },
+          "ROOT_ID": {
+            "children": [
+              "GRID_ID"
+            ],
+            "id": "ROOT_ID",
+            "type": "ROOT"
+          },
+          "ROW--EyBZQlDi": {
+            "children": [
+              "CHART-_T6n_K9iQN",
+              "CHART-6n9jxb30JG"
+            ],
+            "id": "ROW--EyBZQlDi",
+            "meta": {
+              "background": "BACKGROUND_TRANSPARENT"
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID"
+            ],
+            "type": "ROW"
+          },
+          "ROW-2n0XgiHDgs": {
+            "children": [
+              "CHART-eNY0tcE_ic",
+              "MARKDOWN-zaflB60tbC",
+              "CHART-PAXUUqwmX9"
+            ],
+            "id": "ROW-2n0XgiHDgs",
+            "meta": {
+              "background": "BACKGROUND_TRANSPARENT"
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID"
+            ],
+            "type": "ROW"
+          },
+          "ROW-eh0w37bWbR": {
+            "children": [
+              "CHART-g075mMgyYb",
+              "CHART-n-zGGE6S1y",
+              "CHART-6GdlekVise"
+            ],
+            "id": "ROW-eh0w37bWbR",
+            "meta": {
+              "background": "BACKGROUND_TRANSPARENT"
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID"
+            ],
+            "type": "ROW"
+          },
+          "ROW-kzWtcvo8R1": {
+            "children": [
+              "CHART-vJIPjmcbD3",
+              "CHART-Jj9qh1ol-N",
+              "CHART-ODvantb_bF"
+            ],
+            "id": "ROW-kzWtcvo8R1",
+            "meta": {
+              "background": "BACKGROUND_TRANSPARENT"
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID"
+            ],
+            "type": "ROW"
+          }
+        }
+        """  # pylint: enable=line-too-long
+    )
+    pos = json.loads(js)
+    # dashboard v2 doesn't allow add markup slice
+    dash.slices = [slc for slc in slices if slc.viz_type != "markup"]
+    update_slice_ids(pos, dash.slices)
+    dash.dashboard_title = "USA Births Names"
+    dash.position_json = json.dumps(pos, indent=4)
+    dash.slug = "births"
+    db.session.commit()
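
Most of the slices above filter on gender through `gen_filter`, which simply builds the dict Superset expects in `adhoc_filters`. For reference, the call used by the "Girls" slice evaluates to:

    gen_filter("gender", "girl")
    # {
    #     "clause": "WHERE",
    #     "comparator": "girl",
    #     "expressionType": "SIMPLE",
    #     "operator": "==",
    #     "subject": "gender",
    # }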

+ 373 - 0
BI/examples/countries.md

@@ -0,0 +1,373 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+This data was downloaded from the
+[World Bank's Health Nutrition and Population Statistics dataset](https://datacatalog.worldbank.org/dataset/health-nutrition-and-population-statistics)
+
+Here's the script that was used to massage the data:
+
+    DIR = ""
+    df_country = pd.read_csv(DIR + '/HNP_Country.csv')
+    df_country.columns = ['country_code'] + list(df_country.columns[1:])
+    df_country = df_country[['country_code', 'Region']]
+    df_country.columns = ['country_code', 'region']
+
+    df = pd.read_csv(DIR + '/HNP_Data.csv')
+    del df['Unnamed: 60']
+    df.columns = ['country_name', 'country_code'] + list(df.columns[2:])
+    ndf = df.merge(df_country, how='inner')
+
+    dims = ('country_name', 'country_code', 'region')
+    vv = [str(i) for i in range(1960, 2015)]
+    mdf = pd.melt(ndf, id_vars=dims + ('Indicator Code',), value_vars=vv)
+    mdf['year'] = mdf.variable + '-01-01'
+    dims = dims + ('year',)
+
+    pdf = mdf.pivot_table(values='value', columns='Indicator Code', index=dims)
+    pdf = pdf.reset_index()
+    pdf.to_csv(DIR + '/countries.csv')
+    pdf.to_json(DIR + '/countries.json', orient='records')
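+
+As a tiny illustration of the melt/pivot reshaping above (made-up values, two
+indicators and a single year), the wide year columns are first melted into long
+rows and then pivoted back so that each indicator code becomes its own column:
+
+    import pandas as pd
+
+    ndf = pd.DataFrame({
+        'country_name': ['Aruba', 'Aruba'],
+        'country_code': ['ABW', 'ABW'],
+        'region': ['Latin America & Caribbean'] * 2,
+        'Indicator Code': ['SP.POP.TOTL', 'SP.DYN.LE00.IN'],
+        '1960': [54211, 65.6],
+    })
+
+    dims = ['country_name', 'country_code', 'region']
+    mdf = pd.melt(ndf, id_vars=dims + ['Indicator Code'], value_vars=['1960'])
+    mdf['year'] = mdf.variable + '-01-01'
+
+    pdf = mdf.pivot_table(values='value', columns='Indicator Code',
+                          index=dims + ['year']).reset_index()
+    print(pdf)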
+
+Here's the description of the metrics available:
+
+Series Code | Indicator Name
+--- | ---
+NY.GNP.PCAP.CD | GNI per capita, Atlas method (current US$)
+SE.ADT.1524.LT.FM.ZS | Literacy rate, youth (ages 15-24), gender parity index (GPI)
+SE.ADT.1524.LT.MA.ZS | Literacy rate, youth male (% of males ages 15-24)
+SE.ADT.1524.LT.ZS | Literacy rate, youth total (% of people ages 15-24)
+SE.ADT.LITR.FE.ZS | Literacy rate, adult female (% of females ages 15 and above)
+SE.ADT.LITR.MA.ZS | Literacy rate, adult male (% of males ages 15 and above)
+SE.ADT.LITR.ZS | Literacy rate, adult total (% of people ages 15 and above)
+SE.ENR.ORPH | Ratio of school attendance of orphans to school attendance of non-orphans ages 10-14
+SE.PRM.CMPT.FE.ZS | Primary completion rate, female (% of relevant age group)
+SE.PRM.CMPT.MA.ZS | Primary completion rate, male (% of relevant age group)
+SE.PRM.CMPT.ZS | Primary completion rate, total (% of relevant age group)
+SE.PRM.ENRR | School enrollment, primary (% gross)
+SE.PRM.ENRR.FE | School enrollment, primary, female (% gross)
+SE.PRM.ENRR.MA | School enrollment, primary, male (% gross)
+SE.PRM.NENR | School enrollment, primary (% net)
+SE.PRM.NENR.FE | School enrollment, primary, female (% net)
+SE.PRM.NENR.MA | School enrollment, primary, male (% net)
+SE.SEC.ENRR | School enrollment, secondary (% gross)
+SE.SEC.ENRR.FE | School enrollment, secondary, female (% gross)
+SE.SEC.ENRR.MA | School enrollment, secondary, male (% gross)
+SE.SEC.NENR | School enrollment, secondary (% net)
+SE.SEC.NENR.FE | School enrollment, secondary, female (% net)
+SE.SEC.NENR.MA | School enrollment, secondary, male (% net)
+SE.TER.ENRR | School enrollment, tertiary (% gross)
+SE.TER.ENRR.FE | School enrollment, tertiary, female (% gross)
+SE.XPD.TOTL.GD.ZS | Government expenditure on education, total (% of GDP)
+SH.ANM.CHLD.ZS | Prevalence of anemia among children (% of children under 5)
+SH.ANM.NPRG.ZS | Prevalence of anemia among non-pregnant women (% of women ages 15-49)
+SH.CON.1524.FE.ZS | Condom use, population ages 15-24, female (% of females ages 15-24)
+SH.CON.1524.MA.ZS | Condom use, population ages 15-24, male (% of males ages 15-24)
+SH.CON.AIDS.FE.ZS | Condom use at last high-risk sex, adult female (% ages 15-49)
+SH.CON.AIDS.MA.ZS | Condom use at last high-risk sex, adult male (% ages 15-49)
+SH.DTH.COMM.ZS | Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)
+SH.DTH.IMRT | Number of infant deaths
+SH.DTH.INJR.ZS | Cause of death, by injury (% of total)
+SH.DTH.MORT | Number of under-five deaths
+SH.DTH.NCOM.ZS | Cause of death, by non-communicable diseases (% of total)
+SH.DTH.NMRT | Number of neonatal deaths
+SH.DYN.AIDS | Adults (ages 15+) living with HIV
+SH.DYN.AIDS.DH | AIDS estimated deaths (UNAIDS estimates)
+SH.DYN.AIDS.FE.ZS | Women's share of population ages 15+ living with HIV (%)
+SH.DYN.AIDS.ZS | Prevalence of HIV, total (% of population ages 15-49)
+SH.DYN.MORT | Mortality rate, under-5 (per 1,000 live births)
+SH.DYN.MORT.FE | Mortality rate, under-5, female (per 1,000 live births)
+SH.DYN.MORT.MA | Mortality rate, under-5, male (per 1,000 live births)
+SH.DYN.NMRT | Mortality rate, neonatal (per 1,000 live births)
+SH.FPL.SATI.ZS | Met need for contraception (% of married women ages 15-49)
+SH.H2O.SAFE.RU.ZS | Improved water source, rural (% of rural population with access)
+SH.H2O.SAFE.UR.ZS | Improved water source, urban (% of urban population with access)
+SH.H2O.SAFE.ZS | Improved water source (% of population with access)
+SH.HIV.0014 | Children (0-14) living with HIV
+SH.HIV.1524.FE.ZS | Prevalence of HIV, female (% ages 15-24)
+SH.HIV.1524.KW.FE.ZS | Comprehensive correct knowledge of HIV/AIDS, ages 15-24, female (2 prevent ways and reject 3 misconceptions)
+SH.HIV.1524.KW.MA.ZS | Comprehensive correct knowledge of HIV/AIDS, ages 15-24, male (2 prevent ways and reject 3 misconceptions)
+SH.HIV.1524.MA.ZS | Prevalence of HIV, male (% ages 15-24)
+SH.HIV.ARTC.ZS | Antiretroviral therapy coverage (% of people living with HIV)
+SH.HIV.KNOW.FE.ZS | % of females ages 15-49 having comprehensive correct knowledge about HIV (2 prevent ways and reject 3 misconceptions)
+SH.HIV.KNOW.MA.ZS | % of males ages 15-49 having comprehensive correct knowledge about HIV (2 prevent ways and reject 3 misconceptions)
+SH.HIV.ORPH | Children orphaned by HIV/AIDS
+SH.HIV.TOTL | Adults (ages 15+) and children (0-14 years) living with HIV
+SH.IMM.HEPB | Immunization, HepB3 (% of one-year-old children)
+SH.IMM.HIB3 | Immunization, Hib3 (% of children ages 12-23 months)
+SH.IMM.IBCG | Immunization, BCG (% of one-year-old children)
+SH.IMM.IDPT | Immunization, DPT (% of children ages 12-23 months)
+SH.IMM.MEAS | Immunization, measles (% of children ages 12-23 months)
+SH.IMM.POL3 | Immunization, Pol3 (% of one-year-old children)
+SH.MED.BEDS.ZS | Hospital beds (per 1,000 people)
+SH.MED.CMHW.P3 | Community health workers (per 1,000 people)
+SH.MED.NUMW.P3 | Nurses and midwives (per 1,000 people)
+SH.MED.PHYS.ZS | Physicians (per 1,000 people)
+SH.MLR.NETS.ZS | Use of insecticide-treated bed nets (% of under-5 population)
+SH.MLR.PREG.ZS | Use of any antimalarial drug (% of pregnant women)
+SH.MLR.SPF2.ZS | Use of Intermittent Preventive Treatment of malaria, 2+ doses of SP/Fansidar (% of pregnant women)
+SH.MLR.TRET.ZS | Children with fever receiving antimalarial drugs (% of children under age 5 with fever)
+SH.MMR.DTHS | Number of maternal deaths
+SH.MMR.LEVE | Number of weeks of maternity leave
+SH.MMR.RISK | Lifetime risk of maternal death (1 in: rate varies by country)
+SH.MMR.RISK.ZS | Lifetime risk of maternal death (%)
+SH.MMR.WAGE.ZS | Maternal leave benefits (% of wages paid in covered period)
+SH.PRG.ANEM | Prevalence of anemia among pregnant women (%)
+SH.PRG.ARTC.ZS | Antiretroviral therapy coverage (% of pregnant women living with HIV)
+SH.PRG.SYPH.ZS | Prevalence of syphilis (% of women attending antenatal care)
+SH.PRV.SMOK.FE | Smoking prevalence, females (% of adults)
+SH.PRV.SMOK.MA | Smoking prevalence, males (% of adults)
+SH.STA.ACSN | Improved sanitation facilities (% of population with access)
+SH.STA.ACSN.RU | Improved sanitation facilities, rural (% of rural population with access)
+SH.STA.ACSN.UR | Improved sanitation facilities, urban (% of urban population with access)
+SH.STA.ANV4.ZS | Pregnant women receiving prenatal care of at least four visits (% of pregnant women)
+SH.STA.ANVC.ZS | Pregnant women receiving prenatal care (%)
+SH.STA.ARIC.ZS | ARI treatment (% of children under 5 taken to a health provider)
+SH.STA.BFED.ZS | Exclusive breastfeeding (% of children under 6 months)
+SH.STA.BRTC.ZS | Births attended by skilled health staff (% of total)
+SH.STA.BRTW.ZS | Low-birthweight babies (% of births)
+SH.STA.DIAB.ZS | Diabetes prevalence (% of population ages 20 to 79)
+SH.STA.IYCF.ZS | Infant and young child feeding practices, all 3 IYCF (% children ages 6-23 months)
+SH.STA.MALN.FE.ZS | Prevalence of underweight, weight for age, female (% of children under 5)
+SH.STA.MALN.MA.ZS | Prevalence of underweight, weight for age, male (% of children under 5)
+SH.STA.MALN.ZS | Prevalence of underweight, weight for age (% of children under 5)
+SH.STA.MALR | Malaria cases reported
+SH.STA.MMRT | Maternal mortality ratio (modeled estimate, per 100,000 live births)
+SH.STA.MMRT.NE | Maternal mortality ratio (national estimate, per 100,000 live births)
+SH.STA.ORCF.ZS | Diarrhea treatment (% of children under 5 receiving oral rehydration and continued feeding)
+SH.STA.ORTH | Diarrhea treatment (% of children under 5 who received ORS packet)
+SH.STA.OW15.FE.ZS | Prevalence of overweight, female (% of female adults)
+SH.STA.OW15.MA.ZS | Prevalence of overweight, male (% of male adults)
+SH.STA.OW15.ZS | Prevalence of overweight (% of adults)
+SH.STA.OWGH.FE.ZS | Prevalence of overweight, weight for height, female (% of children under 5)
+SH.STA.OWGH.MA.ZS | Prevalence of overweight, weight for height, male (% of children under 5)
+SH.STA.OWGH.ZS | Prevalence of overweight, weight for height (% of children under 5)
+SH.STA.PNVC.ZS | Postnatal care coverage (% mothers)
+SH.STA.STNT.FE.ZS | Prevalence of stunting, height for age, female (% of children under 5)
+SH.STA.STNT.MA.ZS | Prevalence of stunting, height for age, male (% of children under 5)
+SH.STA.STNT.ZS | Prevalence of stunting, height for age (% of children under 5)
+SH.STA.WAST.FE.ZS | Prevalence of wasting, weight for height, female (% of children under 5)
+SH.STA.WAST.MA.ZS | Prevalence of wasting, weight for height, male (% of children under 5)
+SH.STA.WAST.ZS | Prevalence of wasting, weight for height (% of children under 5)
+SH.SVR.WAST.FE.ZS | Prevalence of severe wasting, weight for height, female (% of children under 5)
+SH.SVR.WAST.MA.ZS | Prevalence of severe wasting, weight for height, male (% of children under 5)
+SH.SVR.WAST.ZS | Prevalence of severe wasting, weight for height (% of children under 5)
+SH.TBS.CURE.ZS | Tuberculosis treatment success rate (% of new cases)
+SH.TBS.DTEC.ZS | Tuberculosis case detection rate (%, all forms)
+SH.TBS.INCD | Incidence of tuberculosis (per 100,000 people)
+SH.TBS.MORT | Tuberculosis death rate (per 100,000 people)
+SH.TBS.PREV | Prevalence of tuberculosis (per 100,000 population)
+SH.VAC.TTNS.ZS | Newborns protected against tetanus (%)
+SH.XPD.EXTR.ZS | External resources for health (% of total expenditure on health)
+SH.XPD.OOPC.TO.ZS | Out-of-pocket health expenditure (% of total expenditure on health)
+SH.XPD.OOPC.ZS | Out-of-pocket health expenditure (% of private expenditure on health)
+SH.XPD.PCAP | Health expenditure per capita (current US$)
+SH.XPD.PCAP.PP.KD | Health expenditure per capita, PPP (constant 2011 international $)
+SH.XPD.PRIV | Health expenditure, private (% of total health expenditure)
+SH.XPD.PRIV.ZS | Health expenditure, private (% of GDP)
+SH.XPD.PUBL | Health expenditure, public (% of total health expenditure)
+SH.XPD.PUBL.GX.ZS | Health expenditure, public (% of government expenditure)
+SH.XPD.PUBL.ZS | Health expenditure, public (% of GDP)
+SH.XPD.TOTL.CD | Health expenditure, total (current US$)
+SH.XPD.TOTL.ZS | Health expenditure, total (% of GDP)
+SI.POV.NAHC | Poverty headcount ratio at national poverty lines (% of population)
+SI.POV.RUHC | Rural poverty headcount ratio at national poverty lines (% of rural population)
+SI.POV.URHC | Urban poverty headcount ratio at national poverty lines (% of urban population)
+SL.EMP.INSV.FE.ZS | Share of women in wage employment in the nonagricultural sector (% of total nonagricultural employment)
+SL.TLF.TOTL.FE.ZS | Labor force, female (% of total labor force)
+SL.TLF.TOTL.IN | Labor force, total
+SL.UEM.TOTL.FE.ZS | Unemployment, female (% of female labor force) (modeled ILO estimate)
+SL.UEM.TOTL.MA.ZS | Unemployment, male (% of male labor force) (modeled ILO estimate)
+SL.UEM.TOTL.ZS | Unemployment, total (% of total labor force) (modeled ILO estimate)
+SM.POP.NETM | Net migration
+SN.ITK.DEFC | Number of people who are undernourished
+SN.ITK.DEFC.ZS | Prevalence of undernourishment (% of population)
+SN.ITK.SALT.ZS | Consumption of iodized salt (% of households)
+SN.ITK.VITA.ZS | Vitamin A supplementation coverage rate (% of children ages 6-59 months)
+SP.ADO.TFRT | Adolescent fertility rate (births per 1,000 women ages 15-19)
+SP.DYN.AMRT.FE | Mortality rate, adult, female (per 1,000 female adults)
+SP.DYN.AMRT.MA | Mortality rate, adult, male (per 1,000 male adults)
+SP.DYN.CBRT.IN | Birth rate, crude (per 1,000 people)
+SP.DYN.CDRT.IN | Death rate, crude (per 1,000 people)
+SP.DYN.CONU.ZS | Contraceptive prevalence (% of women ages 15-49)
+SP.DYN.IMRT.FE.IN | Mortality rate, infant, female (per 1,000 live births)
+SP.DYN.IMRT.IN | Mortality rate, infant (per 1,000 live births)
+SP.DYN.IMRT.MA.IN | Mortality rate, infant, male (per 1,000 live births)
+SP.DYN.LE00.FE.IN | Life expectancy at birth, female (years)
+SP.DYN.LE00.IN | Life expectancy at birth, total (years)
+SP.DYN.LE00.MA.IN | Life expectancy at birth, male (years)
+SP.DYN.SMAM.FE | Mean age at first marriage, female
+SP.DYN.SMAM.MA | Mean age at first marriage, male
+SP.DYN.TFRT.IN | Fertility rate, total (births per woman)
+SP.DYN.TO65.FE.ZS | Survival to age 65, female (% of cohort)
+SP.DYN.TO65.MA.ZS | Survival to age 65, male (% of cohort)
+SP.DYN.WFRT | Wanted fertility rate (births per woman)
+SP.HOU.FEMA.ZS | Female headed households (% of households with a female head)
+SP.MTR.1519.ZS | Teenage mothers (% of women ages 15-19 who have had children or are currently pregnant)
+SP.POP.0004.FE | Population ages 0-4, female
+SP.POP.0004.FE.5Y | Population ages 0-4, female (% of female population)
+SP.POP.0004.MA | Population ages 0-4, male
+SP.POP.0004.MA.5Y | Population ages 0-4, male (% of male population)
+SP.POP.0014.FE.ZS | Population ages 0-14, female (% of total)
+SP.POP.0014.MA.ZS | Population ages 0-14, male (% of total)
+SP.POP.0014.TO | Population ages 0-14, total
+SP.POP.0014.TO.ZS | Population ages 0-14 (% of total)
+SP.POP.0509.FE | Population ages 5-9, female
+SP.POP.0509.FE.5Y | Population ages 5-9, female (% of female population)
+SP.POP.0509.MA | Population ages 5-9, male
+SP.POP.0509.MA.5Y | Population ages 5-9, male (% of male population)
+SP.POP.1014.FE | Population ages 10-14, female
+SP.POP.1014.FE.5Y | Population ages 10-14, female (% of female population)
+SP.POP.1014.MA | Population ages 10-14, male
+SP.POP.1014.MA.5Y | Population ages 10-14, male (% of male population)
+SP.POP.1519.FE | Population ages 15-19, female
+SP.POP.1519.FE.5Y | Population ages 15-19, female (% of female population)
+SP.POP.1519.MA | Population ages 15-19, male
+SP.POP.1519.MA.5Y | Population ages 15-19, male (% of male population)
+SP.POP.1564.FE.ZS | Population ages 15-64, female (% of total)
+SP.POP.1564.MA.ZS | Population ages 15-64, male (% of total)
+SP.POP.1564.TO | Population ages 15-64, total
+SP.POP.1564.TO.ZS | Population ages 15-64 (% of total)
+SP.POP.2024.FE | Population ages 20-24, female
+SP.POP.2024.FE.5Y | Population ages 20-24, female (% of female population)
+SP.POP.2024.MA | Population ages 20-24, male
+SP.POP.2024.MA.5Y | Population ages 20-24, male (% of male population)
+SP.POP.2529.FE | Population ages 25-29, female
+SP.POP.2529.FE.5Y | Population ages 25-29, female (% of female population)
+SP.POP.2529.MA | Population ages 25-29, male
+SP.POP.2529.MA.5Y | Population ages 25-29, male (% of male population)
+SP.POP.3034.FE | Population ages 30-34, female
+SP.POP.3034.FE.5Y | Population ages 30-34, female (% of female population)
+SP.POP.3034.MA | Population ages 30-34, male
+SP.POP.3034.MA.5Y | Population ages 30-34, male (% of male population)
+SP.POP.3539.FE | Population ages 35-39, female
+SP.POP.3539.FE.5Y | Population ages 35-39, female (% of female population)
+SP.POP.3539.MA | Population ages 35-39, male
+SP.POP.3539.MA.5Y | Population ages 35-39, male (% of male population)
+SP.POP.4044.FE | Population ages 40-44, female
+SP.POP.4044.FE.5Y | Population ages 40-44, female (% of female population)
+SP.POP.4044.MA | Population ages 40-44, male
+SP.POP.4044.MA.5Y | Population ages 40-44, male (% of male population)
+SP.POP.4549.FE | Population ages 45-49, female
+SP.POP.4549.FE.5Y | Population ages 45-49, female (% of female population)
+SP.POP.4549.MA | Population ages 45-49, male
+SP.POP.4549.MA.5Y | Population ages 45-49, male (% of male population)
+SP.POP.5054.FE | Population ages 50-54, female
+SP.POP.5054.FE.5Y | Population ages 50-54, female (% of female population)
+SP.POP.5054.MA | Population ages 50-54, male
+SP.POP.5054.MA.5Y | Population ages 50-54, male (% of male population)
+SP.POP.5559.FE | Population ages 55-59, female
+SP.POP.5559.FE.5Y | Population ages 55-59, female (% of female population)
+SP.POP.5559.MA | Population ages 55-59, male
+SP.POP.5559.MA.5Y | Population ages 55-59, male (% of male population)
+SP.POP.6064.FE | Population ages 60-64, female
+SP.POP.6064.FE.5Y | Population ages 60-64, female (% of female population)
+SP.POP.6064.MA | Population ages 60-64, male
+SP.POP.6064.MA.5Y | Population ages 60-64, male (% of male population)
+SP.POP.6569.FE | Population ages 65-69, female
+SP.POP.6569.FE.5Y | Population ages 65-69, female (% of female population)
+SP.POP.6569.MA | Population ages 65-69, male
+SP.POP.6569.MA.5Y | Population ages 65-69, male (% of male population)
+SP.POP.65UP.FE.ZS | Population ages 65 and above, female (% of total)
+SP.POP.65UP.MA.ZS | Population ages 65 and above, male (% of total)
+SP.POP.65UP.TO | Population ages 65 and above, total
+SP.POP.65UP.TO.ZS | Population ages 65 and above (% of total)
+SP.POP.7074.FE | Population ages 70-74, female
+SP.POP.7074.FE.5Y | Population ages 70-74, female (% of female population)
+SP.POP.7074.MA | Population ages 70-74, male
+SP.POP.7074.MA.5Y | Population ages 70-74, male (% of male population)
+SP.POP.7579.FE | Population ages 75-79, female
+SP.POP.7579.FE.5Y | Population ages 75-79, female (% of female population)
+SP.POP.7579.MA | Population ages 75-79, male
+SP.POP.7579.MA.5Y | Population ages 75-79, male (% of male population)
+SP.POP.80UP.FE | Population ages 80 and above, female
+SP.POP.80UP.FE.5Y | Population ages 80 and above, female (% of female population)
+SP.POP.80UP.MA | Population ages 80 and above, male
+SP.POP.80UP.MA.5Y | Population ages 80 and above, male (% of male population)
+SP.POP.AG00.FE.IN | Age population, age 0, female, interpolated
+SP.POP.AG00.MA.IN | Age population, age 0, male, interpolated
+SP.POP.AG01.FE.IN | Age population, age 01, female, interpolated
+SP.POP.AG01.MA.IN | Age population, age 01, male, interpolated
+SP.POP.AG02.FE.IN | Age population, age 02, female, interpolated
+SP.POP.AG02.MA.IN | Age population, age 02, male, interpolated
+SP.POP.AG03.FE.IN | Age population, age 03, female, interpolated
+SP.POP.AG03.MA.IN | Age population, age 03, male, interpolated
+SP.POP.AG04.FE.IN | Age population, age 04, female, interpolated
+SP.POP.AG04.MA.IN | Age population, age 04, male, interpolated
+SP.POP.AG05.FE.IN | Age population, age 05, female, interpolated
+SP.POP.AG05.MA.IN | Age population, age 05, male, interpolated
+SP.POP.AG06.FE.IN | Age population, age 06, female, interpolated
+SP.POP.AG06.MA.IN | Age population, age 06, male, interpolated
+SP.POP.AG07.FE.IN | Age population, age 07, female, interpolated
+SP.POP.AG07.MA.IN | Age population, age 07, male, interpolated
+SP.POP.AG08.FE.IN | Age population, age 08, female, interpolated
+SP.POP.AG08.MA.IN | Age population, age 08, male, interpolated
+SP.POP.AG09.FE.IN | Age population, age 09, female, interpolated
+SP.POP.AG09.MA.IN | Age population, age 09, male, interpolated
+SP.POP.AG10.FE.IN | Age population, age 10, female, interpolated
+SP.POP.AG10.MA.IN | Age population, age 10, male, interpolated
+SP.POP.AG11.FE.IN | Age population, age 11, female, interpolated
+SP.POP.AG11.MA.IN | Age population, age 11, male, interpolated
+SP.POP.AG12.FE.IN | Age population, age 12, female, interpolated
+SP.POP.AG12.MA.IN | Age population, age 12, male, interpolated
+SP.POP.AG13.FE.IN | Age population, age 13, female, interpolated
+SP.POP.AG13.MA.IN | Age population, age 13, male, interpolated
+SP.POP.AG14.FE.IN | Age population, age 14, female, interpolated
+SP.POP.AG14.MA.IN | Age population, age 14, male, interpolated
+SP.POP.AG15.FE.IN | Age population, age 15, female, interpolated
+SP.POP.AG15.MA.IN | Age population, age 15, male, interpolated
+SP.POP.AG16.FE.IN | Age population, age 16, female, interpolated
+SP.POP.AG16.MA.IN | Age population, age 16, male, interpolated
+SP.POP.AG17.FE.IN | Age population, age 17, female, interpolated
+SP.POP.AG17.MA.IN | Age population, age 17, male, interpolated
+SP.POP.AG18.FE.IN | Age population, age 18, female, interpolated
+SP.POP.AG18.MA.IN | Age population, age 18, male, interpolated
+SP.POP.AG19.FE.IN | Age population, age 19, female, interpolated
+SP.POP.AG19.MA.IN | Age population, age 19, male, interpolated
+SP.POP.AG20.FE.IN | Age population, age 20, female, interpolated
+SP.POP.AG20.MA.IN | Age population, age 20, male, interpolated
+SP.POP.AG21.FE.IN | Age population, age 21, female, interpolated
+SP.POP.AG21.MA.IN | Age population, age 21, male, interpolated
+SP.POP.AG22.FE.IN | Age population, age 22, female, interpolated
+SP.POP.AG22.MA.IN | Age population, age 22, male, interpolated
+SP.POP.AG23.FE.IN | Age population, age 23, female, interpolated
+SP.POP.AG23.MA.IN | Age population, age 23, male, interpolated
+SP.POP.AG24.FE.IN | Age population, age 24, female, interpolated
+SP.POP.AG24.MA.IN | Age population, age 24, male, interpolated
+SP.POP.AG25.FE.IN | Age population, age 25, female, interpolated
+SP.POP.AG25.MA.IN | Age population, age 25, male, interpolated
+SP.POP.BRTH.MF | Sex ratio at birth (male births per female births)
+SP.POP.DPND | Age dependency ratio (% of working-age population)
+SP.POP.DPND.OL | Age dependency ratio, old (% of working-age population)
+SP.POP.DPND.YG | Age dependency ratio, young (% of working-age population)
+SP.POP.GROW | Population growth (annual %)
+SP.POP.TOTL | Population, total
+SP.POP.TOTL.FE.IN | Population, female
+SP.POP.TOTL.FE.ZS | Population, female (% of total)
+SP.POP.TOTL.MA.IN | Population, male
+SP.POP.TOTL.MA.ZS | Population, male (% of total)
+SP.REG.BRTH.RU.ZS | Completeness of birth registration, rural (%)
+SP.REG.BRTH.UR.ZS | Completeness of birth registration, urban (%)
+SP.REG.BRTH.ZS | Completeness of birth registration (%)
+SP.REG.DTHS.ZS | Completeness of death registration with cause-of-death information (%)
+SP.RUR.TOTL | Rural population
+SP.RUR.TOTL.ZG | Rural population growth (annual %)
+SP.RUR.TOTL.ZS | Rural population (% of total population)
+SP.URB.GROW | Urban population growth (annual %)
+SP.URB.TOTL | Urban population
+SP.URB.TOTL.IN.ZS | Urban population (% of total)
+SP.UWT.TFRT | Unmet need for contraception (% of married women ages 15-49)
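
The listing above (the tail of the BI/examples/countries.md diff) is a plain "CODE | description" table of World Bank indicator series. As a rough, hypothetical sketch of how such a listing could be loaded for lookup, the snippet below parses those lines into a dict; the file path and function name are illustrative assumptions and are not part of this commit.

# Minimal sketch: turn "CODE | description" lines (as in BI/examples/countries.md)
# into a {code: description} mapping. Path and helper name are illustrative only.
from typing import Dict


def load_indicator_names(path: str = "BI/examples/countries.md") -> Dict[str, str]:
    indicators: Dict[str, str] = {}
    with open(path, encoding="utf-8") as fh:
        for raw in fh:
            line = raw.strip().lstrip("+")  # tolerate diff-style "+" prefixes
            if "|" not in line:
                continue  # skip headings, blank lines, and prose
            code, _, description = line.partition("|")
            indicators[code.strip()] = description.strip()
    return indicators


if __name__ == "__main__":
    names = load_indicator_names()
    print(names.get("SP.POP.TOTL"))  # expected: "Population, total"

Splitting only on the first "|" keeps descriptions that contain parentheses or commas intact.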

+ 2505 - 0
BI/examples/countries.py

@@ -0,0 +1,2505 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""This module contains data related to countries and is used for geo mapping"""
+from typing import Any, Dict, List, Optional
+
+countries: List[Dict[str, Any]] = [
+    {
+        "name": "Angola",
+        "area": 1246700,
+        "cioc": "ANG",
+        "cca2": "AO",
+        "capital": "Luanda",
+        "lat": -12.5,
+        "lng": 18.5,
+        "cca3": "AGO",
+    },
+    {
+        "name": "Algeria",
+        "area": 2381741,
+        "cioc": "ALG",
+        "cca2": "DZ",
+        "capital": "Algiers",
+        "lat": 28,
+        "lng": 3,
+        "cca3": "DZA",
+    },
+    {
+        "name": "Egypt",
+        "area": 1002450,
+        "cioc": "EGY",
+        "cca2": "EG",
+        "capital": "Cairo",
+        "lat": 27,
+        "lng": 30,
+        "cca3": "EGY",
+    },
+    {
+        "name": "Bangladesh",
+        "area": 147570,
+        "cioc": "BAN",
+        "cca2": "BD",
+        "capital": "Dhaka",
+        "lat": 24,
+        "lng": 90,
+        "cca3": "BGD",
+    },
+    {
+        "name": "Niger",
+        "area": 1267000,
+        "cioc": "NIG",
+        "cca2": "NE",
+        "capital": "Niamey",
+        "lat": 16,
+        "lng": 8,
+        "cca3": "NER",
+    },
+    {
+        "name": "Liechtenstein",
+        "area": 160,
+        "cioc": "LIE",
+        "cca2": "LI",
+        "capital": "Vaduz",
+        "lat": 47.26666666,
+        "lng": 9.53333333,
+        "cca3": "LIE",
+    },
+    {
+        "name": "Namibia",
+        "area": 825615,
+        "cioc": "NAM",
+        "cca2": "NA",
+        "capital": "Windhoek",
+        "lat": -22,
+        "lng": 17,
+        "cca3": "NAM",
+    },
+    {
+        "name": "Bulgaria",
+        "area": 110879,
+        "cioc": "BUL",
+        "cca2": "BG",
+        "capital": "Sofia",
+        "lat": 43,
+        "lng": 25,
+        "cca3": "BGR",
+    },
+    {
+        "name": "Bolivia",
+        "area": 1098581,
+        "cioc": "BOL",
+        "cca2": "BO",
+        "capital": "Sucre",
+        "lat": -17,
+        "lng": -65,
+        "cca3": "BOL",
+    },
+    {
+        "name": "Ghana",
+        "area": 238533,
+        "cioc": "GHA",
+        "cca2": "GH",
+        "capital": "Accra",
+        "lat": 8,
+        "lng": -2,
+        "cca3": "GHA",
+    },
+    {
+        "name": "Cocos (Keeling) Islands",
+        "area": 14,
+        "cioc": "",
+        "cca2": "CC",
+        "capital": "West Island",
+        "lat": -12.5,
+        "lng": 96.83333333,
+        "cca3": "CCK",
+    },
+    {
+        "name": "Pakistan",
+        "area": 881912,
+        "cioc": "PAK",
+        "cca2": "PK",
+        "capital": "Islamabad",
+        "lat": 30,
+        "lng": 70,
+        "cca3": "PAK",
+    },
+    {
+        "name": "Cape Verde",
+        "area": 4033,
+        "cioc": "CPV",
+        "cca2": "CV",
+        "capital": "Praia",
+        "lat": 16,
+        "lng": -24,
+        "cca3": "CPV",
+    },
+    {
+        "name": "Jordan",
+        "area": 89342,
+        "cioc": "JOR",
+        "cca2": "JO",
+        "capital": "Amman",
+        "lat": 31,
+        "lng": 36,
+        "cca3": "JOR",
+    },
+    {
+        "name": "Liberia",
+        "area": 111369,
+        "cioc": "LBR",
+        "cca2": "LR",
+        "capital": "Monrovia",
+        "lat": 6.5,
+        "lng": -9.5,
+        "cca3": "LBR",
+    },
+    {
+        "name": "Libya",
+        "area": 1759540,
+        "cioc": "LBA",
+        "cca2": "LY",
+        "capital": "Tripoli",
+        "lat": 25,
+        "lng": 17,
+        "cca3": "LBY",
+    },
+    {
+        "name": "Malaysia",
+        "area": 330803,
+        "cioc": "MAS",
+        "cca2": "MY",
+        "capital": "Kuala Lumpur",
+        "lat": 2.5,
+        "lng": 112.5,
+        "cca3": "MYS",
+    },
+    {
+        "name": "Dominican Republic",
+        "area": 48671,
+        "cioc": "DOM",
+        "cca2": "DO",
+        "capital": "Santo Domingo",
+        "lat": 19,
+        "lng": -70.66666666,
+        "cca3": "DOM",
+    },
+    {
+        "name": "Puerto Rico",
+        "area": 8870,
+        "cioc": "PUR",
+        "cca2": "PR",
+        "capital": "San Juan",
+        "lat": 18.25,
+        "lng": -66.5,
+        "cca3": "PRI",
+    },
+    {
+        "name": "Mayotte",
+        "area": 374,
+        "cioc": "",
+        "cca2": "YT",
+        "capital": "Mamoudzou",
+        "lat": -12.83333333,
+        "lng": 45.16666666,
+        "cca3": "MYT",
+    },
+    {
+        "name": "North Korea",
+        "area": 120538,
+        "cioc": "PRK",
+        "cca2": "KP",
+        "capital": "Pyongyang",
+        "lat": 40,
+        "lng": 127,
+        "cca3": "PRK",
+    },
+    {
+        "name": "Palestine",
+        "area": 6220,
+        "cioc": "PLE",
+        "cca2": "PS",
+        "capital": "Ramallah",
+        "lat": 31.9,
+        "lng": 35.2,
+        "cca3": "PSE",
+    },
+    {
+        "name": "Tanzania",
+        "area": 945087,
+        "cioc": "TAN",
+        "cca2": "TZ",
+        "capital": "Dodoma",
+        "lat": -6,
+        "lng": 35,
+        "cca3": "TZA",
+    },
+    {
+        "name": "Botswana",
+        "area": 582000,
+        "cioc": "BOT",
+        "cca2": "BW",
+        "capital": "Gaborone",
+        "lat": -22,
+        "lng": 24,
+        "cca3": "BWA",
+    },
+    {
+        "name": "Cambodia",
+        "area": 181035,
+        "cioc": "CAM",
+        "cca2": "KH",
+        "capital": "Phnom Penh",
+        "lat": 13,
+        "lng": 105,
+        "cca3": "KHM",
+    },
+    {
+        "name": "Nicaragua",
+        "area": 130373,
+        "cioc": "NCA",
+        "cca2": "NI",
+        "capital": "Managua",
+        "lat": 13,
+        "lng": -85,
+        "cca3": "NIC",
+    },
+    {
+        "name": "Trinidad and Tobago",
+        "area": 5130,
+        "cioc": "TTO",
+        "cca2": "TT",
+        "capital": "Port of Spain",
+        "lat": 11,
+        "lng": -61,
+        "cca3": "TTO",
+    },
+    {
+        "name": "Ethiopia",
+        "area": 1104300,
+        "cioc": "ETH",
+        "cca2": "ET",
+        "capital": "Addis Ababa",
+        "lat": 8,
+        "lng": 38,
+        "cca3": "ETH",
+    },
+    {
+        "name": "Paraguay",
+        "area": 406752,
+        "cioc": "PAR",
+        "cca2": "PY",
+        "capital": "Asuncion",
+        "lat": -23,
+        "lng": -58,
+        "cca3": "PRY",
+    },
+    {
+        "name": "Hong Kong",
+        "area": 1104,
+        "cioc": "HKG",
+        "cca2": "HK",
+        "capital": "City of Victoria",
+        "lat": 22.267,
+        "lng": 114.188,
+        "cca3": "HKG",
+    },
+    {
+        "name": "Saudi Arabia",
+        "area": 2149690,
+        "cioc": "KSA",
+        "cca2": "SA",
+        "capital": "Riyadh",
+        "lat": 25,
+        "lng": 45,
+        "cca3": "SAU",
+    },
+    {
+        "name": "Lebanon",
+        "area": 10452,
+        "cioc": "LIB",
+        "cca2": "LB",
+        "capital": "Beirut",
+        "lat": 33.83333333,
+        "lng": 35.83333333,
+        "cca3": "LBN",
+    },
+    {
+        "name": "Slovenia",
+        "area": 20273,
+        "cioc": "SLO",
+        "cca2": "SI",
+        "capital": "Ljubljana",
+        "lat": 46.11666666,
+        "lng": 14.81666666,
+        "cca3": "SVN",
+    },
+    {
+        "name": "Burkina Faso",
+        "area": 272967,
+        "cioc": "BUR",
+        "cca2": "BF",
+        "capital": "Ouagadougou",
+        "lat": 13,
+        "lng": -2,
+        "cca3": "BFA",
+    },
+    {
+        "name": "Switzerland",
+        "area": 41284,
+        "cioc": "SUI",
+        "cca2": "CH",
+        "capital": "Bern",
+        "lat": 47,
+        "lng": 8,
+        "cca3": "CHE",
+    },
+    {
+        "name": "Mauritania",
+        "area": 1030700,
+        "cioc": "MTN",
+        "cca2": "MR",
+        "capital": "Nouakchott",
+        "lat": 20,
+        "lng": -12,
+        "cca3": "MRT",
+    },
+    {
+        "name": "Croatia",
+        "area": 56594,
+        "cioc": "CRO",
+        "cca2": "HR",
+        "capital": "Zagreb",
+        "lat": 45.16666666,
+        "lng": 15.5,
+        "cca3": "HRV",
+    },
+    {
+        "name": "Chile",
+        "area": 756102,
+        "cioc": "CHI",
+        "cca2": "CL",
+        "capital": "Santiago",
+        "lat": -30,
+        "lng": -71,
+        "cca3": "CHL",
+    },
+    {
+        "name": "China",
+        "area": 9706961,
+        "cioc": "CHN",
+        "cca2": "CN",
+        "capital": "Beijing",
+        "lat": 35,
+        "lng": 105,
+        "cca3": "CHN",
+    },
+    {
+        "name": "Saint Kitts and Nevis",
+        "area": 261,
+        "cioc": "SKN",
+        "cca2": "KN",
+        "capital": "Basseterre",
+        "lat": 17.33333333,
+        "lng": -62.75,
+        "cca3": "KNA",
+    },
+    {
+        "name": "Sierra Leone",
+        "area": 71740,
+        "cioc": "SLE",
+        "cca2": "SL",
+        "capital": "Freetown",
+        "lat": 8.5,
+        "lng": -11.5,
+        "cca3": "SLE",
+    },
+    {
+        "name": "Jamaica",
+        "area": 10991,
+        "cioc": "JAM",
+        "cca2": "JM",
+        "capital": "Kingston",
+        "lat": 18.25,
+        "lng": -77.5,
+        "cca3": "JAM",
+    },
+    {
+        "name": "San Marino",
+        "area": 61,
+        "cioc": "SMR",
+        "cca2": "SM",
+        "capital": "City of San Marino",
+        "lat": 43.76666666,
+        "lng": 12.41666666,
+        "cca3": "SMR",
+    },
+    {
+        "name": "Gibraltar",
+        "area": 6,
+        "cioc": "",
+        "cca2": "GI",
+        "capital": "Gibraltar",
+        "lat": 36.13333333,
+        "lng": -5.35,
+        "cca3": "GIB",
+    },
+    {
+        "name": "Djibouti",
+        "area": 23200,
+        "cioc": "DJI",
+        "cca2": "DJ",
+        "capital": "Djibouti",
+        "lat": 11.5,
+        "lng": 43,
+        "cca3": "DJI",
+    },
+    {
+        "name": "Guinea",
+        "area": 245857,
+        "cioc": "GUI",
+        "cca2": "GN",
+        "capital": "Conakry",
+        "lat": 11,
+        "lng": -10,
+        "cca3": "GIN",
+    },
+    {
+        "name": "Finland",
+        "area": 338424,
+        "cioc": "FIN",
+        "cca2": "FI",
+        "capital": "Helsinki",
+        "lat": 64,
+        "lng": 26,
+        "cca3": "FIN",
+    },
+    {
+        "name": "Uruguay",
+        "area": 181034,
+        "cioc": "URU",
+        "cca2": "UY",
+        "capital": "Montevideo",
+        "lat": -33,
+        "lng": -56,
+        "cca3": "URY",
+    },
+    {
+        "name": "Thailand",
+        "area": 513120,
+        "cioc": "THA",
+        "cca2": "TH",
+        "capital": "Bangkok",
+        "lat": 15,
+        "lng": 100,
+        "cca3": "THA",
+    },
+    {
+        "name": "Sao Tome and Principe",
+        "area": 964,
+        "cioc": "STP",
+        "cca2": "ST",
+        "capital": "Sao Tome",
+        "lat": 1,
+        "lng": 7,
+        "cca3": "STP",
+    },
+    {
+        "name": "Seychelles",
+        "area": 452,
+        "cioc": "SEY",
+        "cca2": "SC",
+        "capital": "Victoria",
+        "lat": -4.58333333,
+        "lng": 55.66666666,
+        "cca3": "SYC",
+    },
+    {
+        "name": "Nepal",
+        "area": 147181,
+        "cioc": "NEP",
+        "cca2": "NP",
+        "capital": "Kathmandu",
+        "lat": 28,
+        "lng": 84,
+        "cca3": "NPL",
+    },
+    {
+        "name": "Christmas Island",
+        "area": 135,
+        "cioc": "",
+        "cca2": "CX",
+        "capital": "Flying Fish Cove",
+        "lat": -10.5,
+        "lng": 105.66666666,
+        "cca3": "CXR",
+    },
+    {
+        "name": "Laos",
+        "area": 236800,
+        "cioc": "LAO",
+        "cca2": "LA",
+        "capital": "Vientiane",
+        "lat": 18,
+        "lng": 105,
+        "cca3": "LAO",
+    },
+    {
+        "name": "Yemen",
+        "area": 527968,
+        "cioc": "YEM",
+        "cca2": "YE",
+        "capital": "Sana'a",
+        "lat": 15,
+        "lng": 48,
+        "cca3": "YEM",
+    },
+    {
+        "name": "Bouvet Island",
+        "area": 49,
+        "cioc": "",
+        "cca2": "BV",
+        "capital": "",
+        "lat": -54.43333333,
+        "lng": 3.4,
+        "cca3": "BVT",
+    },
+    {
+        "name": "South Africa",
+        "area": 1221037,
+        "cioc": "RSA",
+        "cca2": "ZA",
+        "capital": "Pretoria",
+        "lat": -29,
+        "lng": 24,
+        "cca3": "ZAF",
+    },
+    {
+        "name": "Kiribati",
+        "area": 811,
+        "cioc": "KIR",
+        "cca2": "KI",
+        "capital": "South Tarawa",
+        "lat": 1.41666666,
+        "lng": 173,
+        "cca3": "KIR",
+    },
+    {
+        "name": "Philippines",
+        "area": 342353,
+        "cioc": "PHI",
+        "cca2": "PH",
+        "capital": "Manila",
+        "lat": 13,
+        "lng": 122,
+        "cca3": "PHL",
+    },
+    {
+        "name": "Sint Maarten",
+        "area": 34,
+        "cioc": "",
+        "cca2": "SX",
+        "capital": "Philipsburg",
+        "lat": 18.033333,
+        "lng": -63.05,
+        "cca3": "SXM",
+    },
+    {
+        "name": "Romania",
+        "area": 238391,
+        "cioc": "ROU",
+        "cca2": "RO",
+        "capital": "Bucharest",
+        "lat": 46,
+        "lng": 25,
+        "cca3": "ROU",
+    },
+    {
+        "name": "United States Virgin Islands",
+        "area": 347,
+        "cioc": "ISV",
+        "cca2": "VI",
+        "capital": "Charlotte Amalie",
+        "lat": 18.35,
+        "lng": -64.933333,
+        "cca3": "VIR",
+    },
+    {
+        "name": "Syria",
+        "area": 185180,
+        "cioc": "SYR",
+        "cca2": "SY",
+        "capital": "Damascus",
+        "lat": 35,
+        "lng": 38,
+        "cca3": "SYR",
+    },
+    {
+        "name": "Macau",
+        "area": 30,
+        "cioc": "",
+        "cca2": "MO",
+        "capital": "",
+        "lat": 22.16666666,
+        "lng": 113.55,
+        "cca3": "MAC",
+    },
+    {
+        "name": "Saint Martin",
+        "area": 53,
+        "cioc": "",
+        "cca2": "MF",
+        "capital": "Marigot",
+        "lat": 18.08333333,
+        "lng": -63.95,
+        "cca3": "MAF",
+    },
+    {
+        "name": "Malta",
+        "area": 316,
+        "cioc": "MLT",
+        "cca2": "MT",
+        "capital": "Valletta",
+        "lat": 35.83333333,
+        "lng": 14.58333333,
+        "cca3": "MLT",
+    },
+    {
+        "name": "Kazakhstan",
+        "area": 2724900,
+        "cioc": "KAZ",
+        "cca2": "KZ",
+        "capital": "Astana",
+        "lat": 48,
+        "lng": 68,
+        "cca3": "KAZ",
+    },
+    {
+        "name": "Turks and Caicos Islands",
+        "area": 948,
+        "cioc": "",
+        "cca2": "TC",
+        "capital": "Cockburn Town",
+        "lat": 21.75,
+        "lng": -71.58333333,
+        "cca3": "TCA",
+    },
+    {
+        "name": "French Polynesia",
+        "area": 4167,
+        "cioc": "",
+        "cca2": "PF",
+        "capital": "Papeete",
+        "lat": -15,
+        "lng": -140,
+        "cca3": "PYF",
+    },
+    {
+        "name": "Niue",
+        "area": 260,
+        "cioc": "",
+        "cca2": "NU",
+        "capital": "Alofi",
+        "lat": -19.03333333,
+        "lng": -169.86666666,
+        "cca3": "NIU",
+    },
+    {
+        "name": "Dominica",
+        "area": 751,
+        "cioc": "DMA",
+        "cca2": "DM",
+        "capital": "Roseau",
+        "lat": 15.41666666,
+        "lng": -61.33333333,
+        "cca3": "DMA",
+    },
+    {
+        "name": "Benin",
+        "area": 112622,
+        "cioc": "BEN",
+        "cca2": "BJ",
+        "capital": "Porto-Novo",
+        "lat": 9.5,
+        "lng": 2.25,
+        "cca3": "BEN",
+    },
+    {
+        "name": "French Guiana",
+        "area": 83534,
+        "cioc": "",
+        "cca2": "GF",
+        "capital": "Cayenne",
+        "lat": 4,
+        "lng": -53,
+        "cca3": "GUF",
+    },
+    {
+        "name": "Belgium",
+        "area": 30528,
+        "cioc": "BEL",
+        "cca2": "BE",
+        "capital": "Brussels",
+        "lat": 50.83333333,
+        "lng": 4,
+        "cca3": "BEL",
+    },
+    {
+        "name": "Montserrat",
+        "area": 102,
+        "cioc": "",
+        "cca2": "MS",
+        "capital": "Plymouth",
+        "lat": 16.75,
+        "lng": -62.2,
+        "cca3": "MSR",
+    },
+    {
+        "name": "Togo",
+        "area": 56785,
+        "cioc": "TOG",
+        "cca2": "TG",
+        "capital": "Lome",
+        "lat": 8,
+        "lng": 1.16666666,
+        "cca3": "TGO",
+    },
+    {
+        "name": "Germany",
+        "area": 357114,
+        "cioc": "GER",
+        "cca2": "DE",
+        "capital": "Berlin",
+        "lat": 51,
+        "lng": 9,
+        "cca3": "DEU",
+    },
+    {
+        "name": "Guam",
+        "area": 549,
+        "cioc": "GUM",
+        "cca2": "GU",
+        "capital": "Hagatna",
+        "lat": 13.46666666,
+        "lng": 144.78333333,
+        "cca3": "GUM",
+    },
+    {
+        "name": "Sri Lanka",
+        "area": 65610,
+        "cioc": "SRI",
+        "cca2": "LK",
+        "capital": "Colombo",
+        "lat": 7,
+        "lng": 81,
+        "cca3": "LKA",
+    },
+    {
+        "name": "South Sudan",
+        "area": 619745,
+        "cioc": "",
+        "cca2": "SS",
+        "capital": "Juba",
+        "lat": 7,
+        "lng": 30,
+        "cca3": "SSD",
+    },
+    {
+        "name": "Falkland Islands",
+        "area": 12173,
+        "cioc": "",
+        "cca2": "FK",
+        "capital": "Stanley",
+        "lat": -51.75,
+        "lng": -59,
+        "cca3": "FLK",
+    },
+    {
+        "name": "United Kingdom",
+        "area": 242900,
+        "cioc": "GBR",
+        "cca2": "GB",
+        "capital": "London",
+        "lat": 54,
+        "lng": -2,
+        "cca3": "GBR",
+    },
+    {
+        "name": "Guyana",
+        "area": 214969,
+        "cioc": "GUY",
+        "cca2": "GY",
+        "capital": "Georgetown",
+        "lat": 5,
+        "lng": -59,
+        "cca3": "GUY",
+    },
+    {
+        "name": "Costa Rica",
+        "area": 51100,
+        "cioc": "CRC",
+        "cca2": "CR",
+        "capital": "San Jose",
+        "lat": 10,
+        "lng": -84,
+        "cca3": "CRI",
+    },
+    {
+        "name": "Cameroon",
+        "area": 475442,
+        "cioc": "CMR",
+        "cca2": "CM",
+        "capital": "Yaounde",
+        "lat": 6,
+        "lng": 12,
+        "cca3": "CMR",
+    },
+    {
+        "name": "Morocco",
+        "area": 446550,
+        "cioc": "MAR",
+        "cca2": "MA",
+        "capital": "Rabat",
+        "lat": 32,
+        "lng": -5,
+        "cca3": "MAR",
+    },
+    {
+        "name": "Northern Mariana Islands",
+        "area": 464,
+        "cioc": "",
+        "cca2": "MP",
+        "capital": "Saipan",
+        "lat": 15.2,
+        "lng": 145.75,
+        "cca3": "MNP",
+    },
+    {
+        "name": "Lesotho",
+        "area": 30355,
+        "cioc": "LES",
+        "cca2": "LS",
+        "capital": "Maseru",
+        "lat": -29.5,
+        "lng": 28.5,
+        "cca3": "LSO",
+    },
+    {
+        "name": "Hungary",
+        "area": 93028,
+        "cioc": "HUN",
+        "cca2": "HU",
+        "capital": "Budapest",
+        "lat": 47,
+        "lng": 20,
+        "cca3": "HUN",
+    },
+    {
+        "name": "Turkmenistan",
+        "area": 488100,
+        "cioc": "TKM",
+        "cca2": "TM",
+        "capital": "Ashgabat",
+        "lat": 40,
+        "lng": 60,
+        "cca3": "TKM",
+    },
+    {
+        "name": "Suriname",
+        "area": 163820,
+        "cioc": "SUR",
+        "cca2": "SR",
+        "capital": "Paramaribo",
+        "lat": 4,
+        "lng": -56,
+        "cca3": "SUR",
+    },
+    {
+        "name": "Netherlands",
+        "area": 41850,
+        "cioc": "NED",
+        "cca2": "NL",
+        "capital": "Amsterdam",
+        "lat": 52.5,
+        "lng": 5.75,
+        "cca3": "NLD",
+    },
+    {
+        "name": "Bermuda",
+        "area": 54,
+        "cioc": "BER",
+        "cca2": "BM",
+        "capital": "Hamilton",
+        "lat": 32.33333333,
+        "lng": -64.75,
+        "cca3": "BMU",
+    },
+    {
+        "name": "Heard Island and McDonald Islands",
+        "area": 412,
+        "cioc": "",
+        "cca2": "HM",
+        "capital": "",
+        "lat": -53.1,
+        "lng": 72.51666666,
+        "cca3": "HMD",
+    },
+    {
+        "name": "Chad",
+        "area": 1284000,
+        "cioc": "CHA",
+        "cca2": "TD",
+        "capital": "N'Djamena",
+        "lat": 15,
+        "lng": 19,
+        "cca3": "TCD",
+    },
+    {
+        "name": "Georgia",
+        "area": 69700,
+        "cioc": "GEO",
+        "cca2": "GE",
+        "capital": "Tbilisi",
+        "lat": 42,
+        "lng": 43.5,
+        "cca3": "GEO",
+    },
+    {
+        "name": "Montenegro",
+        "area": 13812,
+        "cioc": "MNE",
+        "cca2": "ME",
+        "capital": "Podgorica",
+        "lat": 42.5,
+        "lng": 19.3,
+        "cca3": "MNE",
+    },
+    {
+        "name": "Mongolia",
+        "area": 1564110,
+        "cioc": "MGL",
+        "cca2": "MN",
+        "capital": "Ulan Bator",
+        "lat": 46,
+        "lng": 105,
+        "cca3": "MNG",
+    },
+    {
+        "name": "Marshall Islands",
+        "area": 181,
+        "cioc": "MHL",
+        "cca2": "MH",
+        "capital": "Majuro",
+        "lat": 9,
+        "lng": 168,
+        "cca3": "MHL",
+    },
+    {
+        "name": "Martinique",
+        "area": 1128,
+        "cioc": "",
+        "cca2": "MQ",
+        "capital": "Fort-de-France",
+        "lat": 14.666667,
+        "lng": -61,
+        "cca3": "MTQ",
+    },
+    {
+        "name": "Belize",
+        "area": 22966,
+        "cioc": "BIZ",
+        "cca2": "BZ",
+        "capital": "Belmopan",
+        "lat": 17.25,
+        "lng": -88.75,
+        "cca3": "BLZ",
+    },
+    {
+        "name": "Norfolk Island",
+        "area": 36,
+        "cioc": "",
+        "cca2": "NF",
+        "capital": "Kingston",
+        "lat": -29.03333333,
+        "lng": 167.95,
+        "cca3": "NFK",
+    },
+    {
+        "name": "Myanmar",
+        "area": 676578,
+        "cioc": "MYA",
+        "cca2": "MM",
+        "capital": "Naypyidaw",
+        "lat": 22,
+        "lng": 98,
+        "cca3": "MMR",
+    },
+    {
+        "name": "Afghanistan",
+        "area": 652230,
+        "cioc": "AFG",
+        "cca2": "AF",
+        "capital": "Kabul",
+        "lat": 33,
+        "lng": 65,
+        "cca3": "AFG",
+    },
+    {
+        "name": "Burundi",
+        "area": 27834,
+        "cioc": "BDI",
+        "cca2": "BI",
+        "capital": "Bujumbura",
+        "lat": -3.5,
+        "lng": 30,
+        "cca3": "BDI",
+    },
+    {
+        "name": "British Virgin Islands",
+        "area": 151,
+        "cioc": "IVB",
+        "cca2": "VG",
+        "capital": "Road Town",
+        "lat": 18.431383,
+        "lng": -64.62305,
+        "cca3": "VGB",
+    },
+    {
+        "name": "Belarus",
+        "area": 207600,
+        "cioc": "BLR",
+        "cca2": "BY",
+        "capital": "Minsk",
+        "lat": 53,
+        "lng": 28,
+        "cca3": "BLR",
+    },
+    {
+        "name": "Saint Barthelemy",
+        "area": 21,
+        "cioc": "",
+        "cca2": "BL",
+        "capital": "Gustavia",
+        "lat": 18.5,
+        "lng": -63.41666666,
+        "cca3": "BLM",
+    },
+    {
+        "name": "Grenada",
+        "area": 344,
+        "cioc": "GRN",
+        "cca2": "GD",
+        "capital": "St. George's",
+        "lat": 12.11666666,
+        "lng": -61.66666666,
+        "cca3": "GRD",
+    },
+    {
+        "name": "Tokelau",
+        "area": 12,
+        "cioc": "",
+        "cca2": "TK",
+        "capital": "Fakaofo",
+        "lat": -9,
+        "lng": -172,
+        "cca3": "TKL",
+    },
+    {
+        "name": "Greece",
+        "area": 131990,
+        "cioc": "GRE",
+        "cca2": "GR",
+        "capital": "Athens",
+        "lat": 39,
+        "lng": 22,
+        "cca3": "GRC",
+    },
+    {
+        "name": "Russia",
+        "area": 17098242,
+        "cioc": "RUS",
+        "cca2": "RU",
+        "capital": "Moscow",
+        "lat": 60,
+        "lng": 100,
+        "cca3": "RUS",
+    },
+    {
+        "name": "Greenland",
+        "area": 2166086,
+        "cioc": "",
+        "cca2": "GL",
+        "capital": "Nuuk",
+        "lat": 72,
+        "lng": -40,
+        "cca3": "GRL",
+    },
+    {
+        "name": "Andorra",
+        "area": 468,
+        "cioc": "AND",
+        "cca2": "AD",
+        "capital": "Andorra la Vella",
+        "lat": 42.5,
+        "lng": 1.5,
+        "cca3": "AND",
+    },
+    {
+        "name": "Mozambique",
+        "area": 801590,
+        "cioc": "MOZ",
+        "cca2": "MZ",
+        "capital": "Maputo",
+        "lat": -18.25,
+        "lng": 35,
+        "cca3": "MOZ",
+    },
+    {
+        "name": "Tajikistan",
+        "area": 143100,
+        "cioc": "TJK",
+        "cca2": "TJ",
+        "capital": "Dushanbe",
+        "lat": 39,
+        "lng": 71,
+        "cca3": "TJK",
+    },
+    {
+        "name": "Haiti",
+        "area": 27750,
+        "cioc": "HAI",
+        "cca2": "HT",
+        "capital": "Port-au-Prince",
+        "lat": 19,
+        "lng": -72.41666666,
+        "cca3": "HTI",
+    },
+    {
+        "name": "Mexico",
+        "area": 1964375,
+        "cioc": "MEX",
+        "cca2": "MX",
+        "capital": "Mexico City",
+        "lat": 23,
+        "lng": -102,
+        "cca3": "MEX",
+    },
+    {
+        "name": "Zimbabwe",
+        "area": 390757,
+        "cioc": "ZIM",
+        "cca2": "ZW",
+        "capital": "Harare",
+        "lat": -20,
+        "lng": 30,
+        "cca3": "ZWE",
+    },
+    {
+        "name": "Saint Lucia",
+        "area": 616,
+        "cioc": "LCA",
+        "cca2": "LC",
+        "capital": "Castries",
+        "lat": 13.88333333,
+        "lng": -60.96666666,
+        "cca3": "LCA",
+    },
+    {
+        "name": "India",
+        "area": 3287590,
+        "cioc": "IND",
+        "cca2": "IN",
+        "capital": "New Delhi",
+        "lat": 20,
+        "lng": 77,
+        "cca3": "IND",
+    },
+    {
+        "name": "Latvia",
+        "area": 64559,
+        "cioc": "LAT",
+        "cca2": "LV",
+        "capital": "Riga",
+        "lat": 57,
+        "lng": 25,
+        "cca3": "LVA",
+    },
+    {
+        "name": "Bhutan",
+        "area": 38394,
+        "cioc": "BHU",
+        "cca2": "BT",
+        "capital": "Thimphu",
+        "lat": 27.5,
+        "lng": 90.5,
+        "cca3": "BTN",
+    },
+    {
+        "name": "Saint Vincent and the Grenadines",
+        "area": 389,
+        "cioc": "VIN",
+        "cca2": "VC",
+        "capital": "Kingstown",
+        "lat": 13.25,
+        "lng": -61.2,
+        "cca3": "VCT",
+    },
+    {
+        "name": "Vietnam",
+        "area": 331212,
+        "cioc": "VIE",
+        "cca2": "VN",
+        "capital": "Hanoi",
+        "lat": 16.16666666,
+        "lng": 107.83333333,
+        "cca3": "VNM",
+    },
+    {
+        "name": "Norway",
+        "area": 323802,
+        "cioc": "NOR",
+        "cca2": "NO",
+        "capital": "Oslo",
+        "lat": 62,
+        "lng": 10,
+        "cca3": "NOR",
+    },
+    {
+        "name": "Czech Republic",
+        "area": 78865,
+        "cioc": "CZE",
+        "cca2": "CZ",
+        "capital": "Prague",
+        "lat": 49.75,
+        "lng": 15.5,
+        "cca3": "CZE",
+    },
+    {
+        "name": "French Southern and Antarctic Lands",
+        "area": 7747,
+        "cioc": "",
+        "cca2": "TF",
+        "capital": "Port-aux-Francais",
+        "lat": -49.25,
+        "lng": 69.167,
+        "cca3": "ATF",
+    },
+    {
+        "name": "Antigua and Barbuda",
+        "area": 442,
+        "cioc": "ANT",
+        "cca2": "AG",
+        "capital": "Saint John's",
+        "lat": 17.05,
+        "lng": -61.8,
+        "cca3": "ATG",
+    },
+    {
+        "name": "Fiji",
+        "area": 18272,
+        "cioc": "FIJ",
+        "cca2": "FJ",
+        "capital": "Suva",
+        "lat": -18,
+        "lng": 175,
+        "cca3": "FJI",
+    },
+    {
+        "name": "British Indian Ocean Territory",
+        "area": 60,
+        "cioc": "",
+        "cca2": "IO",
+        "capital": "Diego Garcia",
+        "lat": -6,
+        "lng": 71.5,
+        "cca3": "IOT",
+    },
+    {
+        "name": "Honduras",
+        "area": 112492,
+        "cioc": "HON",
+        "cca2": "HN",
+        "capital": "Tegucigalpa",
+        "lat": 15,
+        "lng": -86.5,
+        "cca3": "HND",
+    },
+    {
+        "name": "Mauritius",
+        "area": 2040,
+        "cioc": "MRI",
+        "cca2": "MU",
+        "capital": "Port Louis",
+        "lat": -20.28333333,
+        "lng": 57.55,
+        "cca3": "MUS",
+    },
+    {
+        "name": "Antarctica",
+        "area": 14000000,
+        "cioc": "",
+        "cca2": "AQ",
+        "capital": "",
+        "lat": -90,
+        "lng": 0,
+        "cca3": "ATA",
+    },
+    {
+        "name": "Luxembourg",
+        "area": 2586,
+        "cioc": "LUX",
+        "cca2": "LU",
+        "capital": "Luxembourg",
+        "lat": 49.75,
+        "lng": 6.16666666,
+        "cca3": "LUX",
+    },
+    {
+        "name": "Israel",
+        "area": 20770,
+        "cioc": "ISR",
+        "cca2": "IL",
+        "capital": "Jerusalem",
+        "lat": 31.47,
+        "lng": 35.13,
+        "cca3": "ISR",
+    },
+    {
+        "name": "Micronesia",
+        "area": 702,
+        "cioc": "FSM",
+        "cca2": "FM",
+        "capital": "Palikir",
+        "lat": 6.91666666,
+        "lng": 158.25,
+        "cca3": "FSM",
+    },
+    {
+        "name": "Peru",
+        "area": 1285216,
+        "cioc": "PER",
+        "cca2": "PE",
+        "capital": "Lima",
+        "lat": -10,
+        "lng": -76,
+        "cca3": "PER",
+    },
+    {
+        "name": "Reunion",
+        "area": 2511,
+        "cioc": "",
+        "cca2": "RE",
+        "capital": "Saint-Denis",
+        "lat": -21.15,
+        "lng": 55.5,
+        "cca3": "REU",
+    },
+    {
+        "name": "Indonesia",
+        "area": 1904569,
+        "cioc": "INA",
+        "cca2": "ID",
+        "capital": "Jakarta",
+        "lat": -5,
+        "lng": 120,
+        "cca3": "IDN",
+    },
+    {
+        "name": "Vanuatu",
+        "area": 12189,
+        "cioc": "VAN",
+        "cca2": "VU",
+        "capital": "Port Vila",
+        "lat": -16,
+        "lng": 167,
+        "cca3": "VUT",
+    },
+    {
+        "name": "Macedonia",
+        "area": 25713,
+        "cioc": "MKD",
+        "cca2": "MK",
+        "capital": "Skopje",
+        "lat": 41.83333333,
+        "lng": 22,
+        "cca3": "MKD",
+    },
+    {
+        "name": "DR Congo",
+        "area": 2344858,
+        "cioc": "COD",
+        "cca2": "CD",
+        "capital": "Kinshasa",
+        "lat": 0,
+        "lng": 25,
+        "cca3": "COD",
+    },
+    {
+        "name": "Republic of the Congo",
+        "area": 342000,
+        "cioc": "CGO",
+        "cca2": "CG",
+        "capital": "Brazzaville",
+        "lat": -1,
+        "lng": 15,
+        "cca3": "COG",
+    },
+    {
+        "name": "Iceland",
+        "area": 103000,
+        "cioc": "ISL",
+        "cca2": "IS",
+        "capital": "Reykjavik",
+        "lat": 65,
+        "lng": -18,
+        "cca3": "ISL",
+    },
+    {
+        "name": "Guadeloupe",
+        "area": 1628,
+        "cioc": "",
+        "cca2": "GP",
+        "capital": "Basse-Terre",
+        "lat": 16.25,
+        "lng": -61.583333,
+        "cca3": "GLP",
+    },
+    {
+        "name": "Cook Islands",
+        "area": 236,
+        "cioc": "COK",
+        "cca2": "CK",
+        "capital": "Avarua",
+        "lat": -21.23333333,
+        "lng": -159.76666666,
+        "cca3": "COK",
+    },
+    {
+        "name": "Comoros",
+        "area": 1862,
+        "cioc": "COM",
+        "cca2": "KM",
+        "capital": "Moroni",
+        "lat": -12.16666666,
+        "lng": 44.25,
+        "cca3": "COM",
+    },
+    {
+        "name": "Colombia",
+        "area": 1141748,
+        "cioc": "COL",
+        "cca2": "CO",
+        "capital": "Bogota",
+        "lat": 4,
+        "lng": -72,
+        "cca3": "COL",
+    },
+    {
+        "name": "Nigeria",
+        "area": 923768,
+        "cioc": "NGR",
+        "cca2": "NG",
+        "capital": "Abuja",
+        "lat": 10,
+        "lng": 8,
+        "cca3": "NGA",
+    },
+    {
+        "name": "Timor-Leste",
+        "area": 14874,
+        "cioc": "TLS",
+        "cca2": "TL",
+        "capital": "Dili",
+        "lat": -8.83333333,
+        "lng": 125.91666666,
+        "cca3": "TLS",
+    },
+    {
+        "name": "Taiwan",
+        "area": 36193,
+        "cioc": "TPE",
+        "cca2": "TW",
+        "capital": "Taipei",
+        "lat": 23.5,
+        "lng": 121,
+        "cca3": "TWN",
+    },
+    {
+        "name": "Portugal",
+        "area": 92090,
+        "cioc": "POR",
+        "cca2": "PT",
+        "capital": "Lisbon",
+        "lat": 39.5,
+        "lng": -8,
+        "cca3": "PRT",
+    },
+    {
+        "name": "Moldova",
+        "area": 33846,
+        "cioc": "MDA",
+        "cca2": "MD",
+        "capital": "Chisinau",
+        "lat": 47,
+        "lng": 29,
+        "cca3": "MDA",
+    },
+    {
+        "name": "Guernsey",
+        "area": 78,
+        "cioc": "",
+        "cca2": "GG",
+        "capital": "St. Peter Port",
+        "lat": 49.46666666,
+        "lng": -2.58333333,
+        "cca3": "GGY",
+    },
+    {
+        "name": "Madagascar",
+        "area": 587041,
+        "cioc": "MAD",
+        "cca2": "MG",
+        "capital": "Antananarivo",
+        "lat": -20,
+        "lng": 47,
+        "cca3": "MDG",
+    },
+    {
+        "name": "Ecuador",
+        "area": 276841,
+        "cioc": "ECU",
+        "cca2": "EC",
+        "capital": "Quito",
+        "lat": -2,
+        "lng": -77.5,
+        "cca3": "ECU",
+    },
+    {
+        "name": "Senegal",
+        "area": 196722,
+        "cioc": "SEN",
+        "cca2": "SN",
+        "capital": "Dakar",
+        "lat": 14,
+        "lng": -14,
+        "cca3": "SEN",
+    },
+    {
+        "name": "New Zealand",
+        "area": 270467,
+        "cioc": "NZL",
+        "cca2": "NZ",
+        "capital": "Wellington",
+        "lat": -41,
+        "lng": 174,
+        "cca3": "NZL",
+    },
+    {
+        "name": "Maldives",
+        "area": 300,
+        "cioc": "MDV",
+        "cca2": "MV",
+        "capital": "Male",
+        "lat": 3.25,
+        "lng": 73,
+        "cca3": "MDV",
+    },
+    {
+        "name": "American Samoa",
+        "area": 199,
+        "cioc": "ASA",
+        "cca2": "AS",
+        "capital": "Pago Pago",
+        "lat": -14.33333333,
+        "lng": -170,
+        "cca3": "ASM",
+    },
+    {
+        "name": "Saint Pierre and Miquelon",
+        "area": 242,
+        "cioc": "",
+        "cca2": "PM",
+        "capital": "Saint-Pierre",
+        "lat": 46.83333333,
+        "lng": -56.33333333,
+        "cca3": "SPM",
+    },
+    {
+        "name": "Curacao",
+        "area": 444,
+        "cioc": "",
+        "cca2": "CW",
+        "capital": "Willemstad",
+        "lat": 12.116667,
+        "lng": -68.933333,
+        "cca3": "CUW",
+    },
+    {
+        "name": "France",
+        "area": 551695,
+        "cioc": "FRA",
+        "cca2": "FR",
+        "capital": "Paris",
+        "lat": 46,
+        "lng": 2,
+        "cca3": "FRA",
+    },
+    {
+        "name": "Lithuania",
+        "area": 65300,
+        "cioc": "LTU",
+        "cca2": "LT",
+        "capital": "Vilnius",
+        "lat": 56,
+        "lng": 24,
+        "cca3": "LTU",
+    },
+    {
+        "name": "Rwanda",
+        "area": 26338,
+        "cioc": "RWA",
+        "cca2": "RW",
+        "capital": "Kigali",
+        "lat": -2,
+        "lng": 30,
+        "cca3": "RWA",
+    },
+    {
+        "name": "Zambia",
+        "area": 752612,
+        "cioc": "ZAM",
+        "cca2": "ZM",
+        "capital": "Lusaka",
+        "lat": -15,
+        "lng": 30,
+        "cca3": "ZMB",
+    },
+    {
+        "name": "Gambia",
+        "area": 10689,
+        "cioc": "GAM",
+        "cca2": "GM",
+        "capital": "Banjul",
+        "lat": 13.46666666,
+        "lng": -16.56666666,
+        "cca3": "GMB",
+    },
+    {
+        "name": "Wallis and Futuna",
+        "area": 142,
+        "cioc": "",
+        "cca2": "WF",
+        "capital": "Mata-Utu",
+        "lat": -13.3,
+        "lng": -176.2,
+        "cca3": "WLF",
+    },
+    {
+        "name": "Jersey",
+        "area": 116,
+        "cioc": "",
+        "cca2": "JE",
+        "capital": "Saint Helier",
+        "lat": 49.25,
+        "lng": -2.16666666,
+        "cca3": "JEY",
+    },
+    {
+        "name": "Faroe Islands",
+        "area": 1393,
+        "cioc": "",
+        "cca2": "FO",
+        "capital": "Torshavn",
+        "lat": 62,
+        "lng": -7,
+        "cca3": "FRO",
+    },
+    {
+        "name": "Guatemala",
+        "area": 108889,
+        "cioc": "GUA",
+        "cca2": "GT",
+        "capital": "Guatemala City",
+        "lat": 15.5,
+        "lng": -90.25,
+        "cca3": "GTM",
+    },
+    {
+        "name": "Denmark",
+        "area": 43094,
+        "cioc": "DEN",
+        "cca2": "DK",
+        "capital": "Copenhagen",
+        "lat": 56,
+        "lng": 10,
+        "cca3": "DNK",
+    },
+    {
+        "name": "Isle of Man",
+        "area": 572,
+        "cioc": "",
+        "cca2": "IM",
+        "capital": "Douglas",
+        "lat": 54.25,
+        "lng": -4.5,
+        "cca3": "IMN",
+    },
+    {
+        "name": "Australia",
+        "area": 7692024,
+        "cioc": "AUS",
+        "cca2": "AU",
+        "capital": "Canberra",
+        "lat": -27,
+        "lng": 133,
+        "cca3": "AUS",
+    },
+    {
+        "name": "Austria",
+        "area": 83871,
+        "cioc": "AUT",
+        "cca2": "AT",
+        "capital": "Vienna",
+        "lat": 47.33333333,
+        "lng": 13.33333333,
+        "cca3": "AUT",
+    },
+    {
+        "name": "Svalbard and Jan Mayen",
+        "area": -1,
+        "cioc": "",
+        "cca2": "SJ",
+        "capital": "Longyearbyen",
+        "lat": 78,
+        "lng": 20,
+        "cca3": "SJM",
+    },
+    {
+        "name": "Venezuela",
+        "area": 916445,
+        "cioc": "VEN",
+        "cca2": "VE",
+        "capital": "Caracas",
+        "lat": 8,
+        "lng": -66,
+        "cca3": "VEN",
+    },
+    {
+        "name": "Kosovo",
+        "area": 10908,
+        "cioc": "KOS",
+        "cca2": "XK",
+        "capital": "Pristina",
+        "lat": 42.666667,
+        "lng": 21.166667,
+        "cca3": "UNK",
+    },
+    {
+        "name": "Palau",
+        "area": 459,
+        "cioc": "PLW",
+        "cca2": "PW",
+        "capital": "Ngerulmud",
+        "lat": 7.5,
+        "lng": 134.5,
+        "cca3": "PLW",
+    },
+    {
+        "name": "Kenya",
+        "area": 580367,
+        "cioc": "KEN",
+        "cca2": "KE",
+        "capital": "Nairobi",
+        "lat": 1,
+        "lng": 38,
+        "cca3": "KEN",
+    },
+    {
+        "name": "Samoa",
+        "area": 2842,
+        "cioc": "SAM",
+        "cca2": "WS",
+        "capital": "Apia",
+        "lat": -13.58333333,
+        "lng": -172.33333333,
+        "cca3": "WSM",
+    },
+    {
+        "name": "Turkey",
+        "area": 783562,
+        "cioc": "TUR",
+        "cca2": "TR",
+        "capital": "Ankara",
+        "lat": 39,
+        "lng": 35,
+        "cca3": "TUR",
+    },
+    {
+        "name": "Albania",
+        "area": 28748,
+        "cioc": "ALB",
+        "cca2": "AL",
+        "capital": "Tirana",
+        "lat": 41,
+        "lng": 20,
+        "cca3": "ALB",
+    },
+    {
+        "name": "Oman",
+        "area": 309500,
+        "cioc": "OMA",
+        "cca2": "OM",
+        "capital": "Muscat",
+        "lat": 21,
+        "lng": 57,
+        "cca3": "OMN",
+    },
+    {
+        "name": "Tuvalu",
+        "area": 26,
+        "cioc": "TUV",
+        "cca2": "TV",
+        "capital": "Funafuti",
+        "lat": -8,
+        "lng": 178,
+        "cca3": "TUV",
+    },
+    {
+        "name": "Aland Islands",
+        "area": 1580,
+        "cioc": "",
+        "cca2": "AX",
+        "capital": "Mariehamn",
+        "lat": 60.116667,
+        "lng": 19.9,
+        "cca3": "ALA",
+    },
+    {
+        "name": "Brunei",
+        "area": 5765,
+        "cioc": "BRU",
+        "cca2": "BN",
+        "capital": "Bandar Seri Begawan",
+        "lat": 4.5,
+        "lng": 114.66666666,
+        "cca3": "BRN",
+    },
+    {
+        "name": "Tunisia",
+        "area": 163610,
+        "cioc": "TUN",
+        "cca2": "TN",
+        "capital": "Tunis",
+        "lat": 34,
+        "lng": 9,
+        "cca3": "TUN",
+    },
+    {
+        "name": "Pitcairn Islands",
+        "area": 47,
+        "cioc": "",
+        "cca2": "PN",
+        "capital": "Adamstown",
+        "lat": -25.06666666,
+        "lng": -130.1,
+        "cca3": "PCN",
+    },
+    {
+        "name": "Barbados",
+        "area": 430,
+        "cioc": "BAR",
+        "cca2": "BB",
+        "capital": "Bridgetown",
+        "lat": 13.16666666,
+        "lng": -59.53333333,
+        "cca3": "BRB",
+    },
+    {
+        "name": "Brazil",
+        "area": 8515767,
+        "cioc": "BRA",
+        "cca2": "BR",
+        "capital": "Brasilia",
+        "lat": -10,
+        "lng": -55,
+        "cca3": "BRA",
+    },
+    {
+        "name": "Ivory Coast",
+        "area": 322463,
+        "cioc": "CIV",
+        "cca2": "CI",
+        "capital": "Yamoussoukro",
+        "lat": 8,
+        "lng": -5,
+        "cca3": "CIV",
+    },
+    {
+        "name": "Serbia",
+        "area": 88361,
+        "cioc": "SRB",
+        "cca2": "RS",
+        "capital": "Belgrade",
+        "lat": 44,
+        "lng": 21,
+        "cca3": "SRB",
+    },
+    {
+        "name": "Equatorial Guinea",
+        "area": 28051,
+        "cioc": "GEQ",
+        "cca2": "GQ",
+        "capital": "Malabo",
+        "lat": 2,
+        "lng": 10,
+        "cca3": "GNQ",
+    },
+    {
+        "name": "United States",
+        "area": 9372610,
+        "cioc": "USA",
+        "cca2": "US",
+        "capital": "Washington D.C.",
+        "lat": 38,
+        "lng": -97,
+        "cca3": "USA",
+    },
+    {
+        "name": "Qatar",
+        "area": 11586,
+        "cioc": "QAT",
+        "cca2": "QA",
+        "capital": "Doha",
+        "lat": 25.5,
+        "lng": 51.25,
+        "cca3": "QAT",
+    },
+    {
+        "name": "Sweden",
+        "area": 450295,
+        "cioc": "SWE",
+        "cca2": "SE",
+        "capital": "Stockholm",
+        "lat": 62,
+        "lng": 15,
+        "cca3": "SWE",
+    },
+    {
+        "name": "Azerbaijan",
+        "area": 86600,
+        "cioc": "AZE",
+        "cca2": "AZ",
+        "capital": "Baku",
+        "lat": 40.5,
+        "lng": 47.5,
+        "cca3": "AZE",
+    },
+    {
+        "name": "Guinea-Bissau",
+        "area": 36125,
+        "cioc": "GBS",
+        "cca2": "GW",
+        "capital": "Bissau",
+        "lat": 12,
+        "lng": -15,
+        "cca3": "GNB",
+    },
+    {
+        "name": "Swaziland",
+        "area": 17364,
+        "cioc": "SWZ",
+        "cca2": "SZ",
+        "capital": "Lobamba",
+        "lat": -26.5,
+        "lng": 31.5,
+        "cca3": "SWZ",
+    },
+    {
+        "name": "Tonga",
+        "area": 747,
+        "cioc": "TGA",
+        "cca2": "TO",
+        "capital": "Nuku'alofa",
+        "lat": -20,
+        "lng": -175,
+        "cca3": "TON",
+    },
+    {
+        "name": "Canada",
+        "area": 9984670,
+        "cioc": "CAN",
+        "cca2": "CA",
+        "capital": "Ottawa",
+        "lat": 60,
+        "lng": -95,
+        "cca3": "CAN",
+    },
+    {
+        "name": "Ukraine",
+        "area": 603500,
+        "cioc": "UKR",
+        "cca2": "UA",
+        "capital": "Kiev",
+        "lat": 49,
+        "lng": 32,
+        "cca3": "UKR",
+    },
+    {
+        "name": "South Korea",
+        "area": 100210,
+        "cioc": "KOR",
+        "cca2": "KR",
+        "capital": "Seoul",
+        "lat": 37,
+        "lng": 127.5,
+        "cca3": "KOR",
+    },
+    {
+        "name": "Anguilla",
+        "area": 91,
+        "cioc": "",
+        "cca2": "AI",
+        "capital": "The Valley",
+        "lat": 18.25,
+        "lng": -63.16666666,
+        "cca3": "AIA",
+    },
+    {
+        "name": "Central African Republic",
+        "area": 622984,
+        "cioc": "CAF",
+        "cca2": "CF",
+        "capital": "Bangui",
+        "lat": 7,
+        "lng": 21,
+        "cca3": "CAF",
+    },
+    {
+        "name": "Slovakia",
+        "area": 49037,
+        "cioc": "SVK",
+        "cca2": "SK",
+        "capital": "Bratislava",
+        "lat": 48.66666666,
+        "lng": 19.5,
+        "cca3": "SVK",
+    },
+    {
+        "name": "Cyprus",
+        "area": 9251,
+        "cioc": "CYP",
+        "cca2": "CY",
+        "capital": "Nicosia",
+        "lat": 35,
+        "lng": 33,
+        "cca3": "CYP",
+    },
+    {
+        "name": "Bosnia and Herzegovina",
+        "area": 51209,
+        "cioc": "BIH",
+        "cca2": "BA",
+        "capital": "Sarajevo",
+        "lat": 44,
+        "lng": 18,
+        "cca3": "BIH",
+    },
+    {
+        "name": "Singapore",
+        "area": 710,
+        "cioc": "SIN",
+        "cca2": "SG",
+        "capital": "Singapore",
+        "lat": 1.36666666,
+        "lng": 103.8,
+        "cca3": "SGP",
+    },
+    {
+        "name": "South Georgia",
+        "area": 3903,
+        "cioc": "",
+        "cca2": "GS",
+        "capital": "King Edward Point",
+        "lat": -54.5,
+        "lng": -37,
+        "cca3": "SGS",
+    },
+    {
+        "name": "Somalia",
+        "area": 637657,
+        "cioc": "SOM",
+        "cca2": "SO",
+        "capital": "Mogadishu",
+        "lat": 10,
+        "lng": 49,
+        "cca3": "SOM",
+    },
+    {
+        "name": "Uzbekistan",
+        "area": 447400,
+        "cioc": "UZB",
+        "cca2": "UZ",
+        "capital": "Tashkent",
+        "lat": 41,
+        "lng": 64,
+        "cca3": "UZB",
+    },
+    {
+        "name": "Eritrea",
+        "area": 117600,
+        "cioc": "ERI",
+        "cca2": "ER",
+        "capital": "Asmara",
+        "lat": 15,
+        "lng": 39,
+        "cca3": "ERI",
+    },
+    {
+        "name": "Poland",
+        "area": 312679,
+        "cioc": "POL",
+        "cca2": "PL",
+        "capital": "Warsaw",
+        "lat": 52,
+        "lng": 20,
+        "cca3": "POL",
+    },
+    {
+        "name": "Kuwait",
+        "area": 17818,
+        "cioc": "KUW",
+        "cca2": "KW",
+        "capital": "Kuwait City",
+        "lat": 29.5,
+        "lng": 45.75,
+        "cca3": "KWT",
+    },
+    {
+        "name": "Gabon",
+        "area": 267668,
+        "cioc": "GAB",
+        "cca2": "GA",
+        "capital": "Libreville",
+        "lat": -1,
+        "lng": 11.75,
+        "cca3": "GAB",
+    },
+    {
+        "name": "Cayman Islands",
+        "area": 264,
+        "cioc": "CAY",
+        "cca2": "KY",
+        "capital": "George Town",
+        "lat": 19.5,
+        "lng": -80.5,
+        "cca3": "CYM",
+    },
+    {
+        "name": "Vatican City",
+        "area": 0.44,
+        "cioc": "",
+        "cca2": "VA",
+        "capital": "Vatican City",
+        "lat": 41.9,
+        "lng": 12.45,
+        "cca3": "VAT",
+    },
+    {
+        "name": "Estonia",
+        "area": 45227,
+        "cioc": "EST",
+        "cca2": "EE",
+        "capital": "Tallinn",
+        "lat": 59,
+        "lng": 26,
+        "cca3": "EST",
+    },
+    {
+        "name": "Malawi",
+        "area": 118484,
+        "cioc": "MAW",
+        "cca2": "MW",
+        "capital": "Lilongwe",
+        "lat": -13.5,
+        "lng": 34,
+        "cca3": "MWI",
+    },
+    {
+        "name": "Spain",
+        "area": 505992,
+        "cioc": "ESP",
+        "cca2": "ES",
+        "capital": "Madrid",
+        "lat": 40,
+        "lng": -4,
+        "cca3": "ESP",
+    },
+    {
+        "name": "Iraq",
+        "area": 438317,
+        "cioc": "IRQ",
+        "cca2": "IQ",
+        "capital": "Baghdad",
+        "lat": 33,
+        "lng": 44,
+        "cca3": "IRQ",
+    },
+    {
+        "name": "El Salvador",
+        "area": 21041,
+        "cioc": "ESA",
+        "cca2": "SV",
+        "capital": "San Salvador",
+        "lat": 13.83333333,
+        "lng": -88.91666666,
+        "cca3": "SLV",
+    },
+    {
+        "name": "Mali",
+        "area": 1240192,
+        "cioc": "MLI",
+        "cca2": "ML",
+        "capital": "Bamako",
+        "lat": 17,
+        "lng": -4,
+        "cca3": "MLI",
+    },
+    {
+        "name": "Ireland",
+        "area": 70273,
+        "cioc": "IRL",
+        "cca2": "IE",
+        "capital": "Dublin",
+        "lat": 53,
+        "lng": -8,
+        "cca3": "IRL",
+    },
+    {
+        "name": "Iran",
+        "area": 1648195,
+        "cioc": "IRI",
+        "cca2": "IR",
+        "capital": "Tehran",
+        "lat": 32,
+        "lng": 53,
+        "cca3": "IRN",
+    },
+    {
+        "name": "Aruba",
+        "area": 180,
+        "cioc": "ARU",
+        "cca2": "AW",
+        "capital": "Oranjestad",
+        "lat": 12.5,
+        "lng": -69.96666666,
+        "cca3": "ABW",
+    },
+    {
+        "name": "Papua New Guinea",
+        "area": 462840,
+        "cioc": "PNG",
+        "cca2": "PG",
+        "capital": "Port Moresby",
+        "lat": -6,
+        "lng": 147,
+        "cca3": "PNG",
+    },
+    {
+        "name": "Panama",
+        "area": 75417,
+        "cioc": "PAN",
+        "cca2": "PA",
+        "capital": "Panama City",
+        "lat": 9,
+        "lng": -80,
+        "cca3": "PAN",
+    },
+    {
+        "name": "Sudan",
+        "area": 1886068,
+        "cioc": "SUD",
+        "cca2": "SD",
+        "capital": "Khartoum",
+        "lat": 15,
+        "lng": 30,
+        "cca3": "SDN",
+    },
+    {
+        "name": "Solomon Islands",
+        "area": 28896,
+        "cioc": "SOL",
+        "cca2": "SB",
+        "capital": "Honiara",
+        "lat": -8,
+        "lng": 159,
+        "cca3": "SLB",
+    },
+    {
+        "name": "Western Sahara",
+        "area": 266000,
+        "cioc": "",
+        "cca2": "EH",
+        "capital": "El Aaiun",
+        "lat": 24.5,
+        "lng": -13,
+        "cca3": "ESH",
+    },
+    {
+        "name": "Monaco",
+        "area": 2.02,
+        "cioc": "MON",
+        "cca2": "MC",
+        "capital": "Monaco",
+        "lat": 43.73333333,
+        "lng": 7.4,
+        "cca3": "MCO",
+    },
+    {
+        "name": "Italy",
+        "area": 301336,
+        "cioc": "ITA",
+        "cca2": "IT",
+        "capital": "Rome",
+        "lat": 42.83333333,
+        "lng": 12.83333333,
+        "cca3": "ITA",
+    },
+    {
+        "name": "Japan",
+        "area": 377930,
+        "cioc": "JPN",
+        "cca2": "JP",
+        "capital": "Tokyo",
+        "lat": 36,
+        "lng": 138,
+        "cca3": "JPN",
+    },
+    {
+        "name": "Kyrgyzstan",
+        "area": 199951,
+        "cioc": "KGZ",
+        "cca2": "KG",
+        "capital": "Bishkek",
+        "lat": 41,
+        "lng": 75,
+        "cca3": "KGZ",
+    },
+    {
+        "name": "Uganda",
+        "area": 241550,
+        "cioc": "UGA",
+        "cca2": "UG",
+        "capital": "Kampala",
+        "lat": 1,
+        "lng": 32,
+        "cca3": "UGA",
+    },
+    {
+        "name": "New Caledonia",
+        "area": 18575,
+        "cioc": "",
+        "cca2": "NC",
+        "capital": "Noumea",
+        "lat": -21.5,
+        "lng": 165.5,
+        "cca3": "NCL",
+    },
+    {
+        "name": "United Arab Emirates",
+        "area": 83600,
+        "cioc": "UAE",
+        "cca2": "AE",
+        "capital": "Abu Dhabi",
+        "lat": 24,
+        "lng": 54,
+        "cca3": "ARE",
+    },
+    {
+        "name": "Argentina",
+        "area": 2780400,
+        "cioc": "ARG",
+        "cca2": "AR",
+        "capital": "Buenos Aires",
+        "lat": -34,
+        "lng": -64,
+        "cca3": "ARG",
+    },
+    {
+        "name": "Bahamas",
+        "area": 13943,
+        "cioc": "BAH",
+        "cca2": "BS",
+        "capital": "Nassau",
+        "lat": 24.25,
+        "lng": -76,
+        "cca3": "BHS",
+    },
+    {
+        "name": "Bahrain",
+        "area": 765,
+        "cioc": "BRN",
+        "cca2": "BH",
+        "capital": "Manama",
+        "lat": 26,
+        "lng": 50.55,
+        "cca3": "BHR",
+    },
+    {
+        "name": "Armenia",
+        "area": 29743,
+        "cioc": "ARM",
+        "cca2": "AM",
+        "capital": "Yerevan",
+        "lat": 40,
+        "lng": 45,
+        "cca3": "ARM",
+    },
+    {
+        "name": "Nauru",
+        "area": 21,
+        "cioc": "NRU",
+        "cca2": "NR",
+        "capital": "Yaren",
+        "lat": -0.53333333,
+        "lng": 166.91666666,
+        "cca3": "NRU",
+    },
+    {
+        "name": "Cuba",
+        "area": 109884,
+        "cioc": "CUB",
+        "cca2": "CU",
+        "capital": "Havana",
+        "lat": 21.5,
+        "lng": -80,
+        "cca3": "CUB",
+    },
+]
+
+all_lookups: Dict[str, Dict[str, Dict[str, Any]]] = {}
+lookups = ["cioc", "cca2", "cca3", "name"]
+for lookup in lookups:
+    all_lookups[lookup] = {}
+    for country in countries:
+        all_lookups[lookup][country[lookup].lower()] = country
+
+
+def get(field: str, symbol: str) -> Optional[Dict[str, Any]]:
+    """
+    Look up a country record by one of the supported fields
+    ("cioc", "cca2", "cca3" or "name"); symbol matching is case-insensitive.
+    """
+    return all_lookups[field].get(symbol.lower())
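+
+
+# Example usage (illustrative only; assumes this module is imported as
+# ``countries`` alongside the other example helpers):
+#
+#     countries.get("cca2", "sk")["capital"]   # -> "Bratislava"
+#     countries.get("name", "japan")["cca3"]   # -> "JPN"
+#
+# Lookups are case-insensitive; unknown symbols return None.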

+ 114 - 0
BI/examples/country_map.py

@@ -0,0 +1,114 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import datetime
+
+import pandas as pd
+from sqlalchemy import BigInteger, Date, String
+from sqlalchemy.sql import column
+
+from superset import db
+from superset.connectors.sqla.models import SqlMetric
+from superset.models.slice import Slice
+from superset.utils import core as utils
+
+from .helpers import (
+    get_example_data,
+    get_slice_json,
+    merge_slice,
+    misc_dash_slices,
+    TBL,
+)
+
+
+def load_country_map_data(only_metadata: bool = False, force: bool = False) -> None:
+    """Loading data for map with country map"""
+    tbl_name = "birth_france_by_region"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
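+    # (Re)load the source CSV into the examples database only when the table
+    # is missing or a reload is forced.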
+    if not only_metadata and (not table_exists or force):
+        csv_bytes = get_example_data(
+            "birth_france_data_for_country_map.csv", is_gzip=False, make_bytes=True
+        )
+        data = pd.read_csv(csv_bytes, encoding="utf-8")
+        data["dttm"] = datetime.datetime.now().date()
+        data.to_sql(  # pylint: disable=no-member
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={
+                "DEPT_ID": String(10),
+                "2003": BigInteger,
+                "2004": BigInteger,
+                "2005": BigInteger,
+                "2006": BigInteger,
+                "2007": BigInteger,
+                "2008": BigInteger,
+                "2009": BigInteger,
+                "2010": BigInteger,
+                "2011": BigInteger,
+                "2012": BigInteger,
+                "2013": BigInteger,
+                "2014": BigInteger,
+                "dttm": Date(),
+            },
+            index=False,
+        )
+        print("Done loading table!")
+        print("-" * 80)
+
+    print("Creating table reference")
+    obj = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not obj:
+        obj = TBL(table_name=tbl_name)
+    obj.main_dttm_col = "dttm"
+    obj.database = database
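+    # Make sure an AVG metric over the "2004" column exists before it is
+    # referenced by the country map slice below.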
+    if not any(col.metric_name == "avg__2004" for col in obj.metrics):
+        col = str(column("2004").compile(db.engine))
+        obj.metrics.append(SqlMetric(metric_name="avg__2004", expression=f"AVG({col})"))
+    db.session.merge(obj)
+    db.session.commit()
+    obj.fetch_metadata()
+    tbl = obj
+
+    slice_data = {
+        "granularity_sqla": "",
+        "since": "",
+        "until": "",
+        "viz_type": "country_map",
+        "entity": "DEPT_ID",
+        "metric": {
+            "expressionType": "SIMPLE",
+            "column": {"type": "INT", "column_name": "2004"},
+            "aggregate": "AVG",
+            "label": "Boys",
+            "optionName": "metric_112342",
+        },
+        "row_limit": 500000,
+    }
+
+    print("Creating a slice")
+    slc = Slice(
+        slice_name="Birth in France by department in 2016",
+        viz_type="country_map",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    misc_dash_slices.add(slc.slice_name)
+    merge_slice(slc)

+ 100 - 0
BI/examples/css_templates.py

@@ -0,0 +1,100 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import textwrap
+
+from superset import db
+from superset.models.core import CssTemplate
+
+
+def load_css_templates() -> None:
+    """Loads 2 css templates to demonstrate the feature"""
+    print("Creating default CSS templates")
+
+    obj = db.session.query(CssTemplate).filter_by(template_name="Flat").first()
+    if not obj:
+        obj = CssTemplate(template_name="Flat")
+    css = textwrap.dedent(
+        """\
+    .navbar {
+        transition: opacity 0.5s ease;
+        opacity: 0.05;
+    }
+    .navbar:hover {
+        opacity: 1;
+    }
+    .chart-header .header{
+        font-weight: @font-weight-normal;
+        font-size: 12px;
+    }
+    /*
+    var bnbColors = [
+        //rausch    hackb      kazan      babu      lima        beach     tirol
+        '#ff5a5f', '#7b0051', '#007A87', '#00d1c1', '#8ce071', '#ffb400', '#b4a76c',
+        '#ff8083', '#cc0086', '#00a1b3', '#00ffeb', '#bbedab', '#ffd266', '#cbc29a',
+        '#ff3339', '#ff1ab1', '#005c66', '#00b3a5', '#55d12e', '#b37e00', '#988b4e',
+     ];
+    */
+    """
+    )
+    obj.css = css
+    db.session.merge(obj)
+    db.session.commit()
+
+    obj = db.session.query(CssTemplate).filter_by(template_name="Courier Black").first()
+    if not obj:
+        obj = CssTemplate(template_name="Courier Black")
+    css = textwrap.dedent(
+        """\
+    h2 {
+        color: white;
+        font-size: 52px;
+    }
+    .navbar {
+        box-shadow: none;
+    }
+    .navbar {
+        transition: opacity 0.5s ease;
+        opacity: 0.05;
+    }
+    .navbar:hover {
+        opacity: 1;
+    }
+    .chart-header .header{
+        font-weight: @font-weight-normal;
+        font-size: 12px;
+    }
+    .nvd3 text {
+        font-size: 12px;
+        font-family: inherit;
+    }
+    body{
+        background: #000;
+        font-family: Courier, Monaco, monospace;
+    }
+    /*
+    var bnbColors = [
+        //rausch    hackb      kazan      babu      lima        beach     tirol
+        '#ff5a5f', '#7b0051', '#007A87', '#00d1c1', '#8ce071', '#ffb400', '#b4a76c',
+        '#ff8083', '#cc0086', '#00a1b3', '#00ffeb', '#bbedab', '#ffd266', '#cbc29a',
+        '#ff3339', '#ff1ab1', '#005c66', '#00b3a5', '#55d12e', '#b37e00', '#988b4e',
+     ];
+    */
+    """
+    )
+    obj.css = css
+    db.session.merge(obj)
+    db.session.commit()

+ 529 - 0
BI/examples/deck.py

@@ -0,0 +1,529 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=too-many-statements
+import json
+
+from superset import db
+from superset.models.dashboard import Dashboard
+from superset.models.slice import Slice
+
+from .helpers import get_slice_json, merge_slice, TBL, update_slice_ids
+
+COLOR_RED = {"r": 205, "g": 0, "b": 3, "a": 0.82}
+POSITION_JSON = """\
+{
+    "CHART-3afd9d70": {
+        "meta": {
+            "chartId": 66,
+            "width": 6,
+            "height": 50
+        },
+        "type": "CHART",
+        "id": "CHART-3afd9d70",
+        "children": []
+    },
+    "CHART-2ee7fa5e": {
+        "meta": {
+            "chartId": 67,
+            "width": 6,
+            "height": 50
+        },
+        "type": "CHART",
+        "id": "CHART-2ee7fa5e",
+        "children": []
+    },
+    "CHART-201f7715": {
+        "meta": {
+            "chartId": 68,
+            "width": 6,
+            "height": 50
+        },
+        "type": "CHART",
+        "id": "CHART-201f7715",
+        "children": []
+    },
+    "CHART-d02f6c40": {
+        "meta": {
+            "chartId": 69,
+            "width": 6,
+            "height": 50
+        },
+        "type": "CHART",
+        "id": "CHART-d02f6c40",
+        "children": []
+    },
+    "CHART-2673431d": {
+        "meta": {
+            "chartId": 70,
+            "width": 6,
+            "height": 50
+        },
+        "type": "CHART",
+        "id": "CHART-2673431d",
+        "children": []
+    },
+    "CHART-85265a60": {
+        "meta": {
+            "chartId": 71,
+            "width": 6,
+            "height": 50
+        },
+        "type": "CHART",
+        "id": "CHART-85265a60",
+        "children": []
+    },
+    "CHART-2b87513c": {
+        "meta": {
+            "chartId": 72,
+            "width": 6,
+            "height": 50
+        },
+        "type": "CHART",
+        "id": "CHART-2b87513c",
+        "children": []
+    },
+    "GRID_ID": {
+        "type": "GRID",
+        "id": "GRID_ID",
+        "children": [
+            "ROW-a7b16cb5",
+            "ROW-72c218a5",
+            "ROW-957ba55b",
+            "ROW-af041bdd"
+        ]
+    },
+    "HEADER_ID": {
+        "meta": {
+            "text": "deck.gl Demo"
+        },
+        "type": "HEADER",
+        "id": "HEADER_ID"
+    },
+    "ROOT_ID": {
+        "type": "ROOT",
+        "id": "ROOT_ID",
+        "children": [
+            "GRID_ID"
+        ]
+    },
+    "ROW-72c218a5": {
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW",
+        "id": "ROW-72c218a5",
+        "children": [
+            "CHART-d02f6c40",
+            "CHART-201f7715"
+        ]
+    },
+    "ROW-957ba55b": {
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW",
+        "id": "ROW-957ba55b",
+        "children": [
+            "CHART-2673431d",
+            "CHART-85265a60"
+        ]
+    },
+    "ROW-a7b16cb5": {
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW",
+        "id": "ROW-a7b16cb5",
+        "children": [
+            "CHART-3afd9d70",
+            "CHART-2ee7fa5e"
+        ]
+    },
+    "ROW-af041bdd": {
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW",
+        "id": "ROW-af041bdd",
+        "children": [
+            "CHART-2b87513c"
+        ]
+    },
+    "DASHBOARD_VERSION_KEY": "v2"
+}"""
+
+
+def load_deck_dash() -> None:
+    print("Loading deck.gl dashboard")
+    slices = []
+    tbl = db.session.query(TBL).filter_by(table_name="long_lat").first()
+    slice_data = {
+        "spatial": {"type": "latlong", "lonCol": "LON", "latCol": "LAT"},
+        "color_picker": COLOR_RED,
+        "datasource": "5__table",
+        "granularity_sqla": None,
+        "groupby": [],
+        "mapbox_style": "mapbox://styles/mapbox/light-v9",
+        "multiplier": 10,
+        "point_radius_fixed": {"type": "metric", "value": "count"},
+        "point_unit": "square_m",
+        "min_radius": 1,
+        "max_radius": 250,
+        "row_limit": 5000,
+        "time_range": " : ",
+        "size": "count",
+        "time_grain_sqla": None,
+        "viewport": {
+            "bearing": -4.952916738791771,
+            "latitude": 37.78926922909199,
+            "longitude": -122.42613341901688,
+            "pitch": 4.750411100577438,
+            "zoom": 12.729132798697304,
+        },
+        "viz_type": "deck_scatter",
+    }
+
+    print("Creating Scatterplot slice")
+    slc = Slice(
+        slice_name="Scatterplot",
+        viz_type="deck_scatter",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)
+    slices.append(slc)
+
+    slice_data = {
+        "point_unit": "square_m",
+        "row_limit": 5000,
+        "spatial": {"type": "latlong", "lonCol": "LON", "latCol": "LAT"},
+        "mapbox_style": "mapbox://styles/mapbox/dark-v9",
+        "granularity_sqla": None,
+        "size": "count",
+        "viz_type": "deck_screengrid",
+        "time_range": "No filter",
+        "point_radius": "Auto",
+        "color_picker": {"a": 1, "r": 14, "b": 0, "g": 255},
+        "grid_size": 20,
+        "viewport": {
+            "zoom": 14.161641703941438,
+            "longitude": -122.41827069521386,
+            "bearing": -4.952916738791771,
+            "latitude": 37.76024135844065,
+            "pitch": 4.750411100577438,
+        },
+        "point_radius_fixed": {"type": "fix", "value": 2000},
+        "datasource": "5__table",
+        "time_grain_sqla": None,
+        "groupby": [],
+    }
+    print("Creating Screen Grid slice")
+    slc = Slice(
+        slice_name="Screen grid",
+        viz_type="deck_screengrid",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)
+    slices.append(slc)
+
+    slice_data = {
+        "spatial": {"type": "latlong", "lonCol": "LON", "latCol": "LAT"},
+        "row_limit": 5000,
+        "mapbox_style": "mapbox://styles/mapbox/streets-v9",
+        "granularity_sqla": None,
+        "size": "count",
+        "viz_type": "deck_hex",
+        "time_range": "No filter",
+        "point_radius_unit": "Pixels",
+        "point_radius": "Auto",
+        "color_picker": {"a": 1, "r": 14, "b": 0, "g": 255},
+        "grid_size": 40,
+        "extruded": True,
+        "viewport": {
+            "latitude": 37.789795085160335,
+            "pitch": 54.08961642447763,
+            "zoom": 13.835465702403654,
+            "longitude": -122.40632230075536,
+            "bearing": -2.3984797349335167,
+        },
+        "point_radius_fixed": {"type": "fix", "value": 2000},
+        "datasource": "5__table",
+        "time_grain_sqla": None,
+        "groupby": [],
+    }
+    print("Creating Hex slice")
+    slc = Slice(
+        slice_name="Hexagons",
+        viz_type="deck_hex",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)
+    slices.append(slc)
+
+    slice_data = {
+        "spatial": {"type": "latlong", "lonCol": "LON", "latCol": "LAT"},
+        "row_limit": 5000,
+        "mapbox_style": "mapbox://styles/mapbox/satellite-streets-v9",
+        "granularity_sqla": None,
+        "size": "count",
+        "viz_type": "deck_grid",
+        "point_radius_unit": "Pixels",
+        "point_radius": "Auto",
+        "time_range": "No filter",
+        "color_picker": {"a": 1, "r": 14, "b": 0, "g": 255},
+        "grid_size": 120,
+        "extruded": True,
+        "viewport": {
+            "longitude": -122.42066918995666,
+            "bearing": 155.80099696026355,
+            "zoom": 12.699690845482069,
+            "latitude": 37.7942314882596,
+            "pitch": 53.470800300695146,
+        },
+        "point_radius_fixed": {"type": "fix", "value": 2000},
+        "datasource": "5__table",
+        "time_grain_sqla": None,
+        "groupby": [],
+    }
+    print("Creating Grid slice")
+    slc = Slice(
+        slice_name="Grid",
+        viz_type="deck_grid",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)
+    slices.append(slc)
+
+    polygon_tbl = (
+        db.session.query(TBL).filter_by(table_name="sf_population_polygons").first()
+    )
+    slice_data = {
+        "datasource": "11__table",
+        "viz_type": "deck_polygon",
+        "slice_id": 41,
+        "granularity_sqla": None,
+        "time_grain_sqla": None,
+        "time_range": " : ",
+        "line_column": "contour",
+        "metric": {
+            "aggregate": "SUM",
+            "column": {
+                "column_name": "population",
+                "description": None,
+                "expression": None,
+                "filterable": True,
+                "groupby": True,
+                "id": 1332,
+                "is_dttm": False,
+                "optionName": "_col_population",
+                "python_date_format": None,
+                "type": "BIGINT",
+                "verbose_name": None,
+            },
+            "expressionType": "SIMPLE",
+            "hasCustomLabel": True,
+            "label": "Population",
+            "optionName": "metric_t2v4qbfiz1_w6qgpx4h2p",
+            "sqlExpression": None,
+        },
+        "line_type": "json",
+        "linear_color_scheme": "oranges",
+        "mapbox_style": "mapbox://styles/mapbox/light-v9",
+        "viewport": {
+            "longitude": -122.43388541747726,
+            "latitude": 37.752020331384834,
+            "zoom": 11.133995608594631,
+            "bearing": 37.89506450385642,
+            "pitch": 60,
+            "width": 667,
+            "height": 906,
+            "altitude": 1.5,
+            "maxZoom": 20,
+            "minZoom": 0,
+            "maxPitch": 60,
+            "minPitch": 0,
+            "maxLatitude": 85.05113,
+            "minLatitude": -85.05113,
+        },
+        "reverse_long_lat": False,
+        "fill_color_picker": {"r": 3, "g": 65, "b": 73, "a": 1},
+        "stroke_color_picker": {"r": 0, "g": 122, "b": 135, "a": 1},
+        "filled": True,
+        "stroked": False,
+        "extruded": True,
+        "multiplier": 0.1,
+        "point_radius_fixed": {
+            "type": "metric",
+            "value": {
+                "aggregate": None,
+                "column": None,
+                "expressionType": "SQL",
+                "hasCustomLabel": None,
+                "label": "Density",
+                "optionName": "metric_c5rvwrzoo86_293h6yrv2ic",
+                "sqlExpression": "SUM(population)/SUM(area)",
+            },
+        },
+        "js_columns": [],
+        "js_data_mutator": "",
+        "js_tooltip": "",
+        "js_onclick_href": "",
+        "legend_format": ".1s",
+        "legend_position": "tr",
+    }
+
+    print("Creating Polygon slice")
+    slc = Slice(
+        slice_name="Polygons",
+        viz_type="deck_polygon",
+        datasource_type="table",
+        datasource_id=polygon_tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)
+    slices.append(slc)
+
+    slice_data = {
+        "datasource": "10__table",
+        "viz_type": "deck_arc",
+        "slice_id": 42,
+        "granularity_sqla": None,
+        "time_grain_sqla": None,
+        "time_range": " : ",
+        "start_spatial": {
+            "type": "latlong",
+            "latCol": "LATITUDE",
+            "lonCol": "LONGITUDE",
+        },
+        "end_spatial": {
+            "type": "latlong",
+            "latCol": "LATITUDE_DEST",
+            "lonCol": "LONGITUDE_DEST",
+        },
+        "row_limit": 5000,
+        "mapbox_style": "mapbox://styles/mapbox/light-v9",
+        "viewport": {
+            "altitude": 1.5,
+            "bearing": 8.546256357301871,
+            "height": 642,
+            "latitude": 44.596651438714254,
+            "longitude": -91.84340711201104,
+            "maxLatitude": 85.05113,
+            "maxPitch": 60,
+            "maxZoom": 20,
+            "minLatitude": -85.05113,
+            "minPitch": 0,
+            "minZoom": 0,
+            "pitch": 60,
+            "width": 997,
+            "zoom": 2.929837070560775,
+        },
+        "color_picker": {"r": 0, "g": 122, "b": 135, "a": 1},
+        "stroke_width": 1,
+    }
+
+    print("Creating Arc slice")
+    slc = Slice(
+        slice_name="Arcs",
+        viz_type="deck_arc",
+        datasource_type="table",
+        datasource_id=db.session.query(TBL).filter_by(table_name="flights").first().id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)
+    slices.append(slc)
+
+    slice_data = {
+        "datasource": "12__table",
+        "slice_id": 43,
+        "viz_type": "deck_path",
+        "time_grain_sqla": None,
+        "time_range": " : ",
+        "line_column": "path_json",
+        "line_type": "json",
+        "row_limit": 5000,
+        "mapbox_style": "mapbox://styles/mapbox/light-v9",
+        "viewport": {
+            "longitude": -122.18885402582598,
+            "latitude": 37.73671752604488,
+            "zoom": 9.51847667620428,
+            "bearing": 0,
+            "pitch": 0,
+            "width": 669,
+            "height": 1094,
+            "altitude": 1.5,
+            "maxZoom": 20,
+            "minZoom": 0,
+            "maxPitch": 60,
+            "minPitch": 0,
+            "maxLatitude": 85.05113,
+            "minLatitude": -85.05113,
+        },
+        "color_picker": {"r": 0, "g": 122, "b": 135, "a": 1},
+        "line_width": 150,
+        "reverse_long_lat": False,
+        "js_columns": ["color"],
+        "js_data_mutator": "data => data.map(d => ({\n"
+        "    ...d,\n"
+        "    color: colors.hexToRGB(d.extraProps.color)\n"
+        "}));",
+        "js_tooltip": "",
+        "js_onclick_href": "",
+    }
+
+    print("Creating Path slice")
+    slc = Slice(
+        slice_name="Path",
+        viz_type="deck_path",
+        datasource_type="table",
+        datasource_id=db.session.query(TBL)
+        .filter_by(table_name="bart_lines")
+        .first()
+        .id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)
+    slices.append(slc)
+    slug = "deck"
+
+    print("Creating a dashboard")
+    title = "deck.gl Demo"
+    dash = db.session.query(Dashboard).filter_by(slug=slug).first()
+
+    if not dash:
+        dash = Dashboard()
+    dash.published = True
+    js = POSITION_JSON
+    pos = json.loads(js)
+    update_slice_ids(pos, slices)
+    dash.position_json = json.dumps(pos, indent=4)
+    dash.dashboard_title = title
+    dash.slug = slug
+    dash.slices = slices
+    db.session.merge(dash)
+    db.session.commit()
+
+
+if __name__ == "__main__":
+    load_deck_dash()

+ 141 - 0
BI/examples/energy.py

@@ -0,0 +1,141 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Loads datasets, dashboards and slices in a new superset instance"""
+import textwrap
+
+import pandas as pd
+from sqlalchemy import Float, String
+from sqlalchemy.sql import column
+
+from superset import db
+from superset.connectors.sqla.models import SqlMetric
+from superset.models.slice import Slice
+from superset.utils import core as utils
+
+from .helpers import get_example_data, merge_slice, misc_dash_slices, TBL
+
+
+def load_energy(only_metadata: bool = False, force: bool = False) -> None:
+    """Loads an energy related dataset to use with sankey and graphs"""
+    tbl_name = "energy_usage"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data("energy.json.gz")
+        pdf = pd.read_json(data)
+        pdf.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={"source": String(255), "target": String(255), "value": Float()},
+            index=False,
+        )
+
+    print("Creating table [wb_health_population] reference")
+    tbl = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not tbl:
+        tbl = TBL(table_name=tbl_name)
+    tbl.description = "Energy consumption"
+    tbl.database = database
+
+    if not any(col.metric_name == "sum__value" for col in tbl.metrics):
+        col = str(column("value").compile(db.engine))
+        tbl.metrics.append(
+            SqlMetric(metric_name="sum__value", expression=f"SUM({col})")
+        )
+
+    db.session.merge(tbl)
+    db.session.commit()
+    tbl.fetch_metadata()
+
+    slc = Slice(
+        slice_name="Energy Sankey",
+        viz_type="sankey",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=textwrap.dedent(
+            """\
+        {
+            "collapsed_fieldsets": "",
+            "groupby": [
+                "source",
+                "target"
+            ],
+            "metric": "sum__value",
+            "row_limit": "5000",
+            "slice_name": "Energy Sankey",
+            "viz_type": "sankey"
+        }
+        """
+        ),
+    )
+    misc_dash_slices.add(slc.slice_name)
+    merge_slice(slc)
+
+    slc = Slice(
+        slice_name="Energy Force Layout",
+        viz_type="directed_force",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=textwrap.dedent(
+            """\
+        {
+            "charge": "-500",
+            "collapsed_fieldsets": "",
+            "groupby": [
+                "source",
+                "target"
+            ],
+            "link_length": "200",
+            "metric": "sum__value",
+            "row_limit": "5000",
+            "slice_name": "Force",
+            "viz_type": "directed_force"
+        }
+        """
+        ),
+    )
+    misc_dash_slices.add(slc.slice_name)
+    merge_slice(slc)
+
+    slc = Slice(
+        slice_name="Heatmap",
+        viz_type="heatmap",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=textwrap.dedent(
+            """\
+        {
+            "all_columns_x": "source",
+            "all_columns_y": "target",
+            "canvas_image_rendering": "pixelated",
+            "collapsed_fieldsets": "",
+            "linear_color_scheme": "blue_white_yellow",
+            "metric": "sum__value",
+            "normalize_across": "heatmap",
+            "slice_name": "Heatmap",
+            "viz_type": "heatmap",
+            "xscale_interval": "1",
+            "yscale_interval": "1"
+        }
+        """
+        ),
+    )
+    misc_dash_slices.add(slc.slice_name)
+    merge_slice(slc)

+ 68 - 0
BI/examples/flights.py

@@ -0,0 +1,68 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pandas as pd
+from sqlalchemy import DateTime
+
+from superset import db
+from superset.utils import core as utils
+
+from .helpers import get_example_data, TBL
+
+
+def load_flights(only_metadata: bool = False, force: bool = False) -> None:
+    """Loading random time series data from a zip file in the repo"""
+    tbl_name = "flights"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data("flight_data.csv.gz", make_bytes=True)
+        pdf = pd.read_csv(data, encoding="latin-1")
+
+        # Loading airports info to join and get lat/long
+        airports_bytes = get_example_data("airports.csv.gz", make_bytes=True)
+        airports = pd.read_csv(airports_bytes, encoding="latin-1")
+        airports = airports.set_index("IATA_CODE")
+
+        pdf["ds"] = (
+            pdf.YEAR.map(str) + "-0" + pdf.MONTH.map(str) + "-0" + pdf.DAY.map(str)
+        )
+        pdf.ds = pd.to_datetime(pdf.ds)
+        del pdf["YEAR"]
+        del pdf["MONTH"]
+        del pdf["DAY"]
+
+        pdf = pdf.join(airports, on="ORIGIN_AIRPORT", rsuffix="_ORIG")
+        pdf = pdf.join(airports, on="DESTINATION_AIRPORT", rsuffix="_DEST")
+        pdf.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={"ds": DateTime},
+            index=False,
+        )
+
+    tbl = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not tbl:
+        tbl = TBL(table_name=tbl_name)
+    tbl.description = "Random set of flights in the US"
+    tbl.database = database
+    db.session.merge(tbl)
+    db.session.commit()
+    tbl.fetch_metadata()
+    print("Done loading table!")

+ 78 - 0
BI/examples/helpers-backup.py

@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Loads datasets, dashboards and slices in a new superset instance"""
+import json
+import os
+import zlib
+from io import BytesIO
+from typing import Any, Dict, List, Set
+from urllib import request
+
+from superset import app, db
+from superset.connectors.connector_registry import ConnectorRegistry
+from superset.models import core as models
+from superset.models.slice import Slice
+
+BASE_URL = "https://github.com/apache-superset/examples-data/blob/master/"
+
+# Shortcuts
+DB = models.Database
+
+TBL = ConnectorRegistry.sources["table"]
+
+config = app.config
+
+EXAMPLES_FOLDER = os.path.join(config["BASE_DIR"], "examples")
+
+misc_dash_slices: Set[str] = set()  # slices assembled in a 'Misc Chart' dashboard
+
+
+def update_slice_ids(layout_dict: Dict[Any, Any], slices: List[Slice]) -> None:
+    charts = [
+        component
+        for component in layout_dict.values()
+        if isinstance(component, dict) and component["type"] == "CHART"
+    ]
+    sorted_charts = sorted(charts, key=lambda k: k["meta"]["chartId"])
+    for i, chart_component in enumerate(sorted_charts):
+        if i < len(slices):
+            chart_component["meta"]["chartId"] = int(slices[i].id)
+
+
+def merge_slice(slc: Slice) -> None:
+    o = db.session.query(Slice).filter_by(slice_name=slc.slice_name).first()
+    if o:
+        db.session.delete(o)
+    db.session.add(slc)
+    db.session.commit()
+
+
+def get_slice_json(defaults: Dict[Any, Any], **kwargs: Any) -> str:
+    defaults_copy = defaults.copy()
+    defaults_copy.update(kwargs)
+    return json.dumps(defaults_copy, indent=4, sort_keys=True)
+
+
+def get_example_data(
+    filepath: str, is_gzip: bool = True, make_bytes: bool = False
+) -> BytesIO:
+    content = request.urlopen(f"{BASE_URL}{filepath}?raw=true").read()
+    if is_gzip:
+        content = zlib.decompress(content, zlib.MAX_WBITS | 16)
+    if make_bytes:
+        content = BytesIO(content)
+    return content

+ 78 - 0
BI/examples/helpers.py

@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Loads datasets, dashboards and slices in a new superset instance"""
+import json
+import os
+import zlib
+from io import BytesIO
+from typing import Any, Dict, List, Set
+from urllib import request
+
+from superset import app, db
+from superset.connectors.connector_registry import ConnectorRegistry
+from superset.models import core as models
+from superset.models.slice import Slice
+
+BASE_URL = "https://github.com/apache-superset/examples-data/blob/master/"
+
+# Shortcuts
+DB = models.Database
+
+TBL = ConnectorRegistry.sources["table"]
+
+config = app.config
+
+EXAMPLES_FOLDER = os.path.join(config["BASE_DIR"], "examples")
+
+misc_dash_slices: Set[str] = set()  # slices assembled in a 'Misc Chart' dashboard
+
+
+def update_slice_ids(layout_dict: Dict[Any, Any], slices: List[Slice]) -> None:
+    charts = [
+        component
+        for component in layout_dict.values()
+        if isinstance(component, dict) and component["type"] == "CHART"
+    ]
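+    # Walk the chart placeholders in ascending chartId order and rebind each
+    # one to the id of the corresponding loaded slice.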
+    sorted_charts = sorted(charts, key=lambda k: k["meta"]["chartId"])
+    for i, chart_component in enumerate(sorted_charts):
+        if i < len(slices):
+            chart_component["meta"]["chartId"] = int(slices[i].id)
+
+
+def merge_slice(slc: Slice) -> None:
+    o = db.session.query(Slice).filter_by(slice_name=slc.slice_name).first()
+    if o:
+        db.session.delete(o)
+    db.session.add(slc)
+    db.session.commit()
+
+
+def get_slice_json(defaults: Dict[Any, Any], **kwargs: Any) -> str:
+    defaults_copy = defaults.copy()
+    defaults_copy.update(kwargs)
+    return json.dumps(defaults_copy, indent=4, sort_keys=True)
+
+
+def get_example_data(
+    filepath: str, is_gzip: bool = True, make_bytes: bool = False
+) -> BytesIO:
+    content = request.urlopen(f"{BASE_URL}{filepath}?raw=true").read()
+    if is_gzip:
+        content = zlib.decompress(content, zlib.MAX_WBITS | 16)
+    if make_bytes:
+        content = BytesIO(content)
+    return content
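+
+
+# Typical calls from the example loaders (illustrative):
+#     get_example_data("energy.json.gz")                       # gzipped JSON -> bytes
+#     get_example_data("flight_data.csv.gz", make_bytes=True)  # gzipped CSV -> BytesIO for pandas
+#     get_example_data("birth_france_data_for_country_map.csv", is_gzip=False, make_bytes=True)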

+ 116 - 0
BI/examples/long_lat.py

@@ -0,0 +1,116 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import datetime
+import random
+
+import geohash
+import pandas as pd
+from sqlalchemy import DateTime, Float, String
+
+from superset import db
+from superset.models.slice import Slice
+from superset.utils import core as utils
+
+from .helpers import (
+    get_example_data,
+    get_slice_json,
+    merge_slice,
+    misc_dash_slices,
+    TBL,
+)
+
+
+def load_long_lat_data(only_metadata: bool = False, force: bool = False) -> None:
+    """Loading lat/long data from a csv file in the repo"""
+    tbl_name = "long_lat"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data("san_francisco.csv.gz", make_bytes=True)
+        pdf = pd.read_csv(data, encoding="utf-8")
+        start = datetime.datetime.now().replace(
+            hour=0, minute=0, second=0, microsecond=0
+        )
+        pdf["datetime"] = [
+            start + datetime.timedelta(hours=i * 24 / (len(pdf) - 1))
+            for i in range(len(pdf))
+        ]
+        pdf["occupancy"] = [random.randint(1, 6) for _ in range(len(pdf))]
+        pdf["radius_miles"] = [random.uniform(1, 3) for _ in range(len(pdf))]
+        pdf["geohash"] = pdf[["LAT", "LON"]].apply(lambda x: geohash.encode(*x), axis=1)
+        pdf["delimited"] = pdf["LAT"].map(str).str.cat(pdf["LON"].map(str), sep=",")
+        pdf.to_sql(  # pylint: disable=no-member
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={
+                "longitude": Float(),
+                "latitude": Float(),
+                "number": Float(),
+                "street": String(100),
+                "unit": String(10),
+                "city": String(50),
+                "district": String(50),
+                "region": String(50),
+                "postcode": Float(),
+                "id": String(100),
+                "datetime": DateTime(),
+                "occupancy": Float(),
+                "radius_miles": Float(),
+                "geohash": String(12),
+                "delimited": String(60),
+            },
+            index=False,
+        )
+        print("Done loading table!")
+        print("-" * 80)
+
+    print("Creating table reference")
+    obj = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not obj:
+        obj = TBL(table_name=tbl_name)
+    obj.main_dttm_col = "datetime"
+    obj.database = database
+    db.session.merge(obj)
+    db.session.commit()
+    obj.fetch_metadata()
+    tbl = obj
+
+    slice_data = {
+        "granularity_sqla": "day",
+        "since": "2014-01-01",
+        "until": "now",
+        "viz_type": "mapbox",
+        "all_columns_x": "LON",
+        "all_columns_y": "LAT",
+        "mapbox_style": "mapbox://styles/mapbox/light-v9",
+        "all_columns": ["occupancy"],
+        "row_limit": 500000,
+    }
+
+    print("Creating a slice")
+    slc = Slice(
+        slice_name="Mapbox Long/Lat",
+        viz_type="mapbox",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    misc_dash_slices.add(slc.slice_name)
+    merge_slice(slc)

+ 224 - 0
BI/examples/misc_dashboard-backup.py

@@ -0,0 +1,224 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import json
+import textwrap
+
+from superset import db
+from superset.models.dashboard import Dashboard
+from superset.models.slice import Slice
+
+from .helpers import misc_dash_slices, update_slice_ids
+
+DASH_SLUG = "misc_charts"
+
+
+def load_misc_dashboard() -> None:
+    """Loading a dashboard featuring misc charts"""
+
+    print("Creating the dashboard")
+    db.session.expunge_all()
+    dash = db.session.query(Dashboard).filter_by(slug=DASH_SLUG).first()
+
+    if not dash:
+        dash = Dashboard()
+    js = textwrap.dedent(
+        """\
+{
+    "CHART-BkeVbh8ANQ": {
+        "children": [],
+        "id": "CHART-BkeVbh8ANQ",
+        "meta": {
+            "chartId": 4004,
+            "height": 34,
+            "sliceName": "Multi Line",
+            "width": 8
+        },
+        "type": "CHART"
+    },
+    "CHART-H1HYNzEANX": {
+        "children": [],
+        "id": "CHART-H1HYNzEANX",
+        "meta": {
+            "chartId": 3940,
+            "height": 50,
+            "sliceName": "Energy Sankey",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "CHART-HJOYVMV0E7": {
+        "children": [],
+        "id": "CHART-HJOYVMV0E7",
+        "meta": {
+            "chartId": 3969,
+            "height": 63,
+            "sliceName": "Mapbox Long/Lat",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "CHART-S1WYNz4AVX": {
+        "children": [],
+        "id": "CHART-S1WYNz4AVX",
+        "meta": {
+            "chartId": 3989,
+            "height": 25,
+            "sliceName": "Parallel Coordinates",
+            "width": 4
+        },
+        "type": "CHART"
+    },
+    "CHART-r19KVMNCE7": {
+        "children": [],
+        "id": "CHART-r19KVMNCE7",
+        "meta": {
+            "chartId": 3971,
+            "height": 34,
+            "sliceName": "Calendar Heatmap multiformat 0",
+            "width": 4
+        },
+        "type": "CHART"
+    },
+    "CHART-rJ4K4GV04Q": {
+        "children": [],
+        "id": "CHART-rJ4K4GV04Q",
+        "meta": {
+            "chartId": 3941,
+            "height": 63,
+            "sliceName": "Energy Force Layout",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "CHART-rkgF4G4A4X": {
+        "children": [],
+        "id": "CHART-rkgF4G4A4X",
+        "meta": {
+            "chartId": 3970,
+            "height": 25,
+            "sliceName": "Birth in France by department in 2016",
+            "width": 8
+        },
+        "type": "CHART"
+    },
+    "CHART-rywK4GVR4X": {
+        "children": [],
+        "id": "CHART-rywK4GVR4X",
+        "meta": {
+            "chartId": 3942,
+            "height": 50,
+            "sliceName": "Heatmap",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "COLUMN-ByUFVf40EQ": {
+        "children": [
+            "CHART-rywK4GVR4X",
+            "CHART-HJOYVMV0E7"
+        ],
+        "id": "COLUMN-ByUFVf40EQ",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT",
+            "width": 6
+        },
+        "type": "COLUMN"
+    },
+    "COLUMN-rkmYVGN04Q": {
+        "children": [
+            "CHART-rJ4K4GV04Q",
+            "CHART-H1HYNzEANX"
+        ],
+        "id": "COLUMN-rkmYVGN04Q",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT",
+            "width": 6
+        },
+        "type": "COLUMN"
+    },
+    "GRID_ID": {
+        "children": [
+            "ROW-SytNzNA4X",
+            "ROW-S1MK4M4A4X",
+            "ROW-HkFFEzVRVm"
+        ],
+        "id": "GRID_ID",
+        "type": "GRID"
+    },
+    "HEADER_ID": {
+        "id": "HEADER_ID",
+        "meta": {
+            "text": "Misc Charts"
+        },
+        "type": "HEADER"
+    },
+    "ROOT_ID": {
+        "children": [
+            "GRID_ID"
+        ],
+        "id": "ROOT_ID",
+        "type": "ROOT"
+    },
+    "ROW-HkFFEzVRVm": {
+        "children": [
+            "CHART-r19KVMNCE7",
+            "CHART-BkeVbh8ANQ"
+        ],
+        "id": "ROW-HkFFEzVRVm",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "ROW-S1MK4M4A4X": {
+        "children": [
+            "COLUMN-rkmYVGN04Q",
+            "COLUMN-ByUFVf40EQ"
+        ],
+        "id": "ROW-S1MK4M4A4X",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "ROW-SytNzNA4X": {
+        "children": [
+            "CHART-rkgF4G4A4X",
+            "CHART-S1WYNz4AVX"
+        ],
+        "id": "ROW-SytNzNA4X",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "DASHBOARD_VERSION_KEY": "v2"
+}
+    """
+    )
+    pos = json.loads(js)
+    slices = (
+        db.session.query(Slice).filter(Slice.slice_name.in_(misc_dash_slices)).all()
+    )
+    slices = sorted(slices, key=lambda x: x.id)
+    update_slice_ids(pos, slices)
+    dash.dashboard_title = "Misc Charts"
+    dash.position_json = json.dumps(pos, indent=4)
+    dash.slug = DASH_SLUG
+    dash.slices = slices
+    db.session.merge(dash)
+    db.session.commit()

+ 224 - 0
BI/examples/misc_dashboard.py

@@ -0,0 +1,224 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import json
+import textwrap
+
+from superset import db
+from superset.models.dashboard import Dashboard
+from superset.models.slice import Slice
+
+from .helpers import misc_dash_slices, update_slice_ids
+
+DASH_SLUG = "misc_charts"
+
+
+def load_misc_dashboard() -> None:
+    """Loading a dashboard featuring misc charts"""
+
+    print("Creating the dashboard")
+    db.session.expunge_all()
+    dash = db.session.query(Dashboard).filter_by(slug=DASH_SLUG).first()
+
+    if not dash:
+        dash = Dashboard()
+    js = textwrap.dedent(
+        """\
+{
+    "CHART-BkeVbh8ANQ": {
+        "children": [],
+        "id": "CHART-BkeVbh8ANQ",
+        "meta": {
+            "chartId": 4004,
+            "height": 34,
+            "sliceName": "Multi Line",
+            "width": 8
+        },
+        "type": "CHART"
+    },
+    "CHART-H1HYNzEANX": {
+        "children": [],
+        "id": "CHART-H1HYNzEANX",
+        "meta": {
+            "chartId": 3940,
+            "height": 50,
+            "sliceName": "Energy Sankey",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "CHART-HJOYVMV0E7": {
+        "children": [],
+        "id": "CHART-HJOYVMV0E7",
+        "meta": {
+            "chartId": 3969,
+            "height": 63,
+            "sliceName": "Mapbox Long/Lat",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "CHART-S1WYNz4AVX": {
+        "children": [],
+        "id": "CHART-S1WYNz4AVX",
+        "meta": {
+            "chartId": 3989,
+            "height": 25,
+            "sliceName": "Parallel Coordinates",
+            "width": 4
+        },
+        "type": "CHART"
+    },
+    "CHART-r19KVMNCE7": {
+        "children": [],
+        "id": "CHART-r19KVMNCE7",
+        "meta": {
+            "chartId": 3971,
+            "height": 34,
+            "sliceName": "Calendar Heatmap multiformat 0",
+            "width": 4
+        },
+        "type": "CHART"
+    },
+    "CHART-rJ4K4GV04Q": {
+        "children": [],
+        "id": "CHART-rJ4K4GV04Q",
+        "meta": {
+            "chartId": 3941,
+            "height": 63,
+            "sliceName": "Energy Force Layout",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "CHART-rkgF4G4A4X": {
+        "children": [],
+        "id": "CHART-rkgF4G4A4X",
+        "meta": {
+            "chartId": 3970,
+            "height": 25,
+            "sliceName": "Birth in France by department in 2016",
+            "width": 8
+        },
+        "type": "CHART"
+    },
+    "CHART-rywK4GVR4X": {
+        "children": [],
+        "id": "CHART-rywK4GVR4X",
+        "meta": {
+            "chartId": 3942,
+            "height": 50,
+            "sliceName": "Heatmap",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "COLUMN-ByUFVf40EQ": {
+        "children": [
+            "CHART-rywK4GVR4X",
+            "CHART-HJOYVMV0E7"
+        ],
+        "id": "COLUMN-ByUFVf40EQ",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT",
+            "width": 6
+        },
+        "type": "COLUMN"
+    },
+    "COLUMN-rkmYVGN04Q": {
+        "children": [
+            "CHART-rJ4K4GV04Q",
+            "CHART-H1HYNzEANX"
+        ],
+        "id": "COLUMN-rkmYVGN04Q",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT",
+            "width": 6
+        },
+        "type": "COLUMN"
+    },
+    "GRID_ID": {
+        "children": [
+            "ROW-SytNzNA4X",
+            "ROW-S1MK4M4A4X",
+            "ROW-HkFFEzVRVm"
+        ],
+        "id": "GRID_ID",
+        "type": "GRID"
+    },
+    "HEADER_ID": {
+        "id": "HEADER_ID",
+        "meta": {
+            "text": "Misc Charts"
+        },
+        "type": "HEADER"
+    },
+    "ROOT_ID": {
+        "children": [
+            "GRID_ID"
+        ],
+        "id": "ROOT_ID",
+        "type": "ROOT"
+    },
+    "ROW-HkFFEzVRVm": {
+        "children": [
+            "CHART-r19KVMNCE7",
+            "CHART-BkeVbh8ANQ"
+        ],
+        "id": "ROW-HkFFEzVRVm",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "ROW-S1MK4M4A4X": {
+        "children": [
+            "COLUMN-rkmYVGN04Q",
+            "COLUMN-ByUFVf40EQ"
+        ],
+        "id": "ROW-S1MK4M4A4X",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "ROW-SytNzNA4X": {
+        "children": [
+            "CHART-rkgF4G4A4X",
+            "CHART-S1WYNz4AVX"
+        ],
+        "id": "ROW-SytNzNA4X",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "DASHBOARD_VERSION_KEY": "v2"
+}
+    """
+    )
+    pos = json.loads(js)
+    slices = (
+        db.session.query(Slice).filter(Slice.slice_name.in_(misc_dash_slices)).all()
+    )
+    slices = sorted(slices, key=lambda x: x.id)
+    update_slice_ids(pos, slices)
+    dash.dashboard_title = "Misc Charts"
+    dash.position_json = json.dumps(pos, indent=4)
+    dash.slug = DASH_SLUG
+    dash.slices = slices
+    db.session.merge(dash)
+    db.session.commit()
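All of the dashboard loaders in this commit lean on `update_slice_ids` from `.helpers` to swap the hard-coded `chartId` values in the position JSON for the ids of the slices that were just merged. The sketch below is a hypothetical stand-in (`update_slice_ids_sketch`), not the implementation in `helpers.py`; it only shows the minimal work such a helper has to do:

    def update_slice_ids_sketch(position_json: dict, slices: list) -> None:
        # collect the CHART nodes of the layout tree (skip plain string entries
        # such as DASHBOARD_VERSION_KEY)
        chart_nodes = [
            node
            for node in position_json.values()
            if isinstance(node, dict) and node.get("type") == "CHART"
        ]
        # pair the nodes, in a stable order, with the freshly merged slices and
        # point each node's chartId at a real slice id
        chart_nodes.sort(key=lambda node: node["meta"]["chartId"])
        for node, slc in zip(chart_nodes, slices):
            node["meta"]["chartId"] = slc.id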

+ 58 - 0
BI/examples/multi_line.py

@@ -0,0 +1,58 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import json
+
+from superset import db
+from superset.models.slice import Slice
+
+from .birth_names import load_birth_names
+from .helpers import merge_slice, misc_dash_slices
+from .world_bank import load_world_bank_health_n_pop
+
+
+def load_multi_line(only_metadata: bool = False) -> None:
+    load_world_bank_health_n_pop(only_metadata)
+    load_birth_names(only_metadata)
+    ids = [
+        row.id
+        for row in db.session.query(Slice).filter(
+            Slice.slice_name.in_(["Growth Rate", "Trends"])
+        )
+    ]
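+    # note: the filter above does not guarantee which id belongs to "Growth Rate"
+    # and which to "Trends", so the two line_charts lists below may end up swapped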
+
+    slc = Slice(
+        datasource_type="table",  # not true, but needed
+        datasource_id=1,  # cannot be empty
+        slice_name="Multi Line",
+        viz_type="line_multi",
+        params=json.dumps(
+            {
+                "slice_name": "Multi Line",
+                "viz_type": "line_multi",
+                "line_charts": [ids[0]],
+                "line_charts_2": [ids[1]],
+                "since": "1970",
+                "until": "1995",
+                "prefix_metric_with_slice_name": True,
+                "show_legend": False,
+                "x_axis_format": "%Y",
+            }
+        ),
+    )
+
+    misc_dash_slices.add(slc.slice_name)
+    merge_slice(slc)

+ 117 - 0
BI/examples/multiformat_time_series.py

@@ -0,0 +1,117 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from typing import Dict, Optional, Tuple
+
+import pandas as pd
+from sqlalchemy import BigInteger, Date, DateTime, String
+
+from superset import db
+from superset.models.slice import Slice
+from superset.utils.core import get_example_database
+
+from .helpers import (
+    config,
+    get_example_data,
+    get_slice_json,
+    merge_slice,
+    misc_dash_slices,
+    TBL,
+)
+
+
+def load_multiformat_time_series(
+    only_metadata: bool = False, force: bool = False
+) -> None:
+    """Loading time series data from a zip file in the repo"""
+    tbl_name = "multiformat_time_series"
+    database = get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data("multiformat_time_series.json.gz")
+        pdf = pd.read_json(data)
+
+        pdf.ds = pd.to_datetime(pdf.ds, unit="s")
+        pdf.ds2 = pd.to_datetime(pdf.ds2, unit="s")
+        pdf.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={
+                "ds": Date,
+                "ds2": DateTime,
+                "epoch_s": BigInteger,
+                "epoch_ms": BigInteger,
+                "string0": String(100),
+                "string1": String(100),
+                "string2": String(100),
+                "string3": String(100),
+            },
+            index=False,
+        )
+        print("Done loading table!")
+        print("-" * 80)
+
+    print(f"Creating table [{tbl_name}] reference")
+    obj = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not obj:
+        obj = TBL(table_name=tbl_name)
+    obj.main_dttm_col = "ds"
+    obj.database = database
+    dttm_and_expr_dict: Dict[str, Tuple[Optional[str], None]] = {
+        "ds": (None, None),
+        "ds2": (None, None),
+        "epoch_s": ("epoch_s", None),
+        "epoch_ms": ("epoch_ms", None),
+        "string2": ("%Y%m%d-%H%M%S", None),
+        "string1": ("%Y-%m-%d^%H:%M:%S", None),
+        "string0": ("%Y-%m-%d %H:%M:%S.%f", None),
+        "string3": ("%Y/%m/%d%H:%M:%S.%f", None),
+    }
+    for col in obj.columns:
+        dttm_and_expr = dttm_and_expr_dict[col.column_name]
+        col.python_date_format = dttm_and_expr[0]
+        col.database_expression = dttm_and_expr[1]
+        col.is_dttm = True
+    db.session.merge(obj)
+    db.session.commit()
+    obj.fetch_metadata()
+    tbl = obj
+
+    print("Creating Heatmap charts")
+    for i, col in enumerate(tbl.columns):
+        slice_data = {
+            "metrics": ["count"],
+            "granularity_sqla": col.column_name,
+            "row_limit": config["ROW_LIMIT"],
+            "since": "2015",
+            "until": "2016",
+            "viz_type": "cal_heatmap",
+            "domain_granularity": "month",
+            "subdomain_granularity": "day",
+        }
+
+        slc = Slice(
+            slice_name=f"Calendar Heatmap multiformat {i}",
+            viz_type="cal_heatmap",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(slice_data),
+        )
+        merge_slice(slc)
+    misc_dash_slices.add("Calendar Heatmap multiformat 0")
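The `dttm_and_expr_dict` above records, for each column, the `python_date_format` needed to parse its time representation; the two epoch columns get no format string and are interpreted by unit instead. A stand-alone sanity check of those format strings, using only the standard library and pandas (illustrative, not part of the loader):

    from datetime import datetime
    import pandas as pd

    # each string format parses a sample timestamp
    datetime.strptime("20200131-120000", "%Y%m%d-%H%M%S")
    datetime.strptime("2020-01-31^12:00:00", "%Y-%m-%d^%H:%M:%S")
    datetime.strptime("2020-01-31 12:00:00.000000", "%Y-%m-%d %H:%M:%S.%f")
    datetime.strptime("2020/01/3112:00:00.000000", "%Y/%m/%d%H:%M:%S.%f")

    # epoch_s / epoch_ms are plain integers converted by unit
    pd.to_datetime(1580472000, unit="s")      # 2020-01-31 12:00:00
    pd.to_datetime(1580472000000, unit="ms")  # same instant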

+ 60 - 0
BI/examples/paris.py

@@ -0,0 +1,60 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import json
+
+import pandas as pd
+from sqlalchemy import String, Text
+
+from superset import db
+from superset.utils import core as utils
+
+from .helpers import get_example_data, TBL
+
+
+def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) -> None:
+    tbl_name = "paris_iris_mapping"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data("paris_iris.json.gz")
+        df = pd.read_json(data)
+        df["features"] = df.features.map(json.dumps)
+
+        df.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={
+                "color": String(255),
+                "name": String(255),
+                "features": Text,
+                "type": Text,
+            },
+            index=False,
+        )
+
+    print("Creating table {} reference".format(tbl_name))
+    tbl = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not tbl:
+        tbl = TBL(table_name=tbl_name)
+    tbl.description = "Map of Paris"
+    tbl.database = database
+    db.session.merge(tbl)
+    db.session.commit()
+    tbl.fetch_metadata()

+ 81 - 0
BI/examples/random_time_series.py

@@ -0,0 +1,81 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pandas as pd
+from sqlalchemy import DateTime
+
+from superset import db
+from superset.models.slice import Slice
+from superset.utils import core as utils
+
+from .helpers import config, get_example_data, get_slice_json, merge_slice, TBL
+
+
+def load_random_time_series_data(
+    only_metadata: bool = False, force: bool = False
+) -> None:
+    """Loading random time series data from a zip file in the repo"""
+    tbl_name = "random_time_series"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data("random_time_series.json.gz")
+        pdf = pd.read_json(data)
+        pdf.ds = pd.to_datetime(pdf.ds, unit="s")
+        pdf.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={"ds": DateTime},
+            index=False,
+        )
+        print("Done loading table!")
+        print("-" * 80)
+
+    print(f"Creating table [{tbl_name}] reference")
+    obj = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not obj:
+        obj = TBL(table_name=tbl_name)
+    obj.main_dttm_col = "ds"
+    obj.database = database
+    db.session.merge(obj)
+    db.session.commit()
+    obj.fetch_metadata()
+    tbl = obj
+
+    slice_data = {
+        "granularity_sqla": "day",
+        "row_limit": config["ROW_LIMIT"],
+        "since": "2019-01-01",
+        "until": "2019-02-01",
+        "metric": "count",
+        "viz_type": "cal_heatmap",
+        "domain_granularity": "month",
+        "subdomain_granularity": "day",
+    }
+
+    print("Creating a slice")
+    slc = Slice(
+        slice_name="Calendar Heatmap",
+        viz_type="cal_heatmap",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)

+ 62 - 0
BI/examples/sf_population_polygons.py

@@ -0,0 +1,62 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import json
+
+import pandas as pd
+from sqlalchemy import BigInteger, Float, Text
+
+from superset import db
+from superset.utils import core as utils
+
+from .helpers import get_example_data, TBL
+
+
+def load_sf_population_polygons(
+    only_metadata: bool = False, force: bool = False
+) -> None:
+    tbl_name = "sf_population_polygons"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data("sf_population.json.gz")
+        df = pd.read_json(data)
+        df["contour"] = df.contour.map(json.dumps)
+
+        df.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={
+                "zipcode": BigInteger,
+                "population": BigInteger,
+                "contour": Text,
+                "area": Float,
+            },
+            index=False,
+        )
+
+    print("Creating table {} reference".format(tbl_name))
+    tbl = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not tbl:
+        tbl = TBL(table_name=tbl_name)
+    tbl.description = "Population density of San Francisco"
+    tbl.database = database
+    db.session.merge(tbl)
+    db.session.commit()
+    tbl.fetch_metadata()

+ 342 - 0
BI/examples/tabbed_dashboard-backup.py

@@ -0,0 +1,342 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Loads datasets, dashboards and slices in a new superset instance"""
+import json
+import textwrap
+
+from superset import db
+from superset.models.dashboard import Dashboard
+from superset.models.slice import Slice
+
+from .helpers import update_slice_ids
+
+
+def load_tabbed_dashboard(_: bool = False) -> None:
+    """Creating a tabbed dashboard"""
+
+    print("Creating a dashboard with nested tabs")
+    slug = "tabbed_dash"
+    dash = db.session.query(Dashboard).filter_by(slug=slug).first()
+
+    if not dash:
+        dash = Dashboard()
+
+    # reuse charts in "World's Bank Data and create
+    # new dashboard with nested tabs
+    tabbed_dash_slices = set()
+    tabbed_dash_slices.add("Region Filter")
+    tabbed_dash_slices.add("Growth Rate")
+    tabbed_dash_slices.add("Treemap")
+    tabbed_dash_slices.add("Box plot")
+
+    js = textwrap.dedent(
+        """\
+    {
+      "CHART-c0EjR-OZ0n": {
+        "children": [],
+        "id": "CHART-c0EjR-OZ0n",
+        "meta": {
+          "chartId": 870,
+          "height": 50,
+          "sliceName": "Box plot",
+          "width": 4
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "ROW-7G2o5uDvfo"
+        ],
+        "type": "CHART"
+      },
+      "CHART-dxV7Il74hH": {
+        "children": [],
+        "id": "CHART-dxV7Il74hH",
+        "meta": {
+          "chartId": 797,
+          "height": 50,
+          "sliceName": "Treemap",
+          "width": 4
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-gcQJxApOZS",
+          "TABS-afnrUvdxYF",
+          "TAB-jNNd4WWar1",
+          "ROW-7ygtDczaQ"
+        ],
+        "type": "CHART"
+      },
+      "CHART-jJ5Yj1Ptaz": {
+        "children": [],
+        "id": "CHART-jJ5Yj1Ptaz",
+        "meta": {
+          "chartId": 789,
+          "height": 50,
+          "sliceName": "World's Population",
+          "width": 4
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj",
+          "TAB-z81Q87PD7",
+          "ROW-G73z9PIHn"
+        ],
+        "type": "CHART"
+      },
+      "CHART-z4gmEuCqQ5": {
+        "children": [],
+        "id": "CHART-z4gmEuCqQ5",
+        "meta": {
+          "chartId": 788,
+          "height": 50,
+          "sliceName": "Region Filter",
+          "width": 4
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj",
+          "TAB-EcNm_wh922",
+          "ROW-LCjsdSetJ"
+        ],
+        "type": "CHART"
+      },
+      "DASHBOARD_VERSION_KEY": "v2",
+      "GRID_ID": {
+        "children": [],
+        "id": "GRID_ID",
+        "type": "GRID"
+      },
+      "HEADER_ID": {
+        "id": "HEADER_ID",
+        "meta": {
+          "text": "Tabbed Dashboard"
+        },
+        "type": "HEADER"
+      },
+      "ROOT_ID": {
+        "children": [
+          "TABS-lV0r00f4H1"
+        ],
+        "id": "ROOT_ID",
+        "type": "ROOT"
+      },
+      "ROW-7G2o5uDvfo": {
+        "children": [
+          "CHART-c0EjR-OZ0n"
+        ],
+        "id": "ROW-7G2o5uDvfo",
+        "meta": {
+          "background": "BACKGROUND_TRANSPARENT"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS"
+        ],
+        "type": "ROW"
+      },
+      "ROW-7ygtDczaQ": {
+        "children": [
+          "CHART-dxV7Il74hH"
+        ],
+        "id": "ROW-7ygtDczaQ",
+        "meta": {
+          "background": "BACKGROUND_TRANSPARENT"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-gcQJxApOZS",
+          "TABS-afnrUvdxYF",
+          "TAB-jNNd4WWar1"
+        ],
+        "type": "ROW"
+      },
+      "ROW-G73z9PIHn": {
+        "children": [
+          "CHART-jJ5Yj1Ptaz"
+        ],
+        "id": "ROW-G73z9PIHn",
+        "meta": {
+          "background": "BACKGROUND_TRANSPARENT"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj",
+          "TAB-z81Q87PD7"
+        ],
+        "type": "ROW"
+      },
+      "ROW-LCjsdSetJ": {
+        "children": [
+          "CHART-z4gmEuCqQ5"
+        ],
+        "id": "ROW-LCjsdSetJ",
+        "meta": {
+          "background": "BACKGROUND_TRANSPARENT"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj",
+          "TAB-EcNm_wh922"
+        ],
+        "type": "ROW"
+      },
+      "TAB-EcNm_wh922": {
+        "children": [
+          "ROW-LCjsdSetJ"
+        ],
+        "id": "TAB-EcNm_wh922",
+        "meta": {
+          "text": "row tab 1"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj"
+        ],
+        "type": "TAB"
+      },
+      "TAB-NF3dlrWGS": {
+        "children": [
+          "ROW-7G2o5uDvfo",
+          "TABS-CSjo6VfNrj"
+        ],
+        "id": "TAB-NF3dlrWGS",
+        "meta": {
+          "text": "Tab A"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1"
+        ],
+        "type": "TAB"
+      },
+      "TAB-gcQJxApOZS": {
+        "children": [
+          "TABS-afnrUvdxYF"
+        ],
+        "id": "TAB-gcQJxApOZS",
+        "meta": {
+          "text": "Tab B"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1"
+        ],
+        "type": "TAB"
+      },
+      "TAB-jNNd4WWar1": {
+        "children": [
+          "ROW-7ygtDczaQ"
+        ],
+        "id": "TAB-jNNd4WWar1",
+        "meta": {
+          "text": "New Tab"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-gcQJxApOZS",
+          "TABS-afnrUvdxYF"
+        ],
+        "type": "TAB"
+      },
+      "TAB-z81Q87PD7": {
+        "children": [
+          "ROW-G73z9PIHn"
+        ],
+        "id": "TAB-z81Q87PD7",
+        "meta": {
+          "text": "row tab 2"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj"
+        ],
+        "type": "TAB"
+      },
+      "TABS-CSjo6VfNrj": {
+        "children": [
+          "TAB-EcNm_wh922",
+          "TAB-z81Q87PD7"
+        ],
+        "id": "TABS-CSjo6VfNrj",
+        "meta": {},
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS"
+        ],
+        "type": "TABS"
+      },
+      "TABS-afnrUvdxYF": {
+        "children": [
+          "TAB-jNNd4WWar1"
+        ],
+        "id": "TABS-afnrUvdxYF",
+        "meta": {},
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-gcQJxApOZS"
+        ],
+        "type": "TABS"
+      },
+      "TABS-lV0r00f4H1": {
+        "children": [
+          "TAB-NF3dlrWGS",
+          "TAB-gcQJxApOZS"
+        ],
+        "id": "TABS-lV0r00f4H1",
+        "meta": {},
+        "parents": [
+          "ROOT_ID"
+        ],
+        "type": "TABS"
+      }
+    }
+        """
+    )
+    pos = json.loads(js)
+    slices = [
+        db.session.query(Slice).filter_by(slice_name=name).first()
+        for name in tabbed_dash_slices
+    ]
+
+    slices = sorted(slices, key=lambda x: x.id)
+    update_slice_ids(pos, slices)
+    dash.position_json = json.dumps(pos, indent=4)
+    dash.slices = slices
+    dash.dashboard_title = "Tabbed Dashboard"
+    dash.slug = slug
+
+    db.session.merge(dash)
+    db.session.commit()

+ 342 - 0
BI/examples/tabbed_dashboard.py

@@ -0,0 +1,342 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Loads datasets, dashboards and slices in a new superset instance"""
+import json
+import textwrap
+
+from superset import db
+from superset.models.dashboard import Dashboard
+from superset.models.slice import Slice
+
+from .helpers import update_slice_ids
+
+
+def load_tabbed_dashboard(_: bool = False) -> None:
+    """Creating a tabbed dashboard"""
+
+    print("Creating a dashboard with nested tabs")
+    slug = "tabbed_dash"
+    dash = db.session.query(Dashboard).filter_by(slug=slug).first()
+
+    if not dash:
+        dash = Dashboard()
+
+    # reuse charts in "World's Bank Data and create
+    # new dashboard with nested tabs
+    tabbed_dash_slices = set()
+    tabbed_dash_slices.add("Region Filter")
+    tabbed_dash_slices.add("Growth Rate")
+    tabbed_dash_slices.add("Treemap")
+    tabbed_dash_slices.add("Box plot")
+
+    js = textwrap.dedent(
+        """\
+    {
+      "CHART-c0EjR-OZ0n": {
+        "children": [],
+        "id": "CHART-c0EjR-OZ0n",
+        "meta": {
+          "chartId": 870,
+          "height": 50,
+          "sliceName": "Box plot",
+          "width": 4
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "ROW-7G2o5uDvfo"
+        ],
+        "type": "CHART"
+      },
+      "CHART-dxV7Il74hH": {
+        "children": [],
+        "id": "CHART-dxV7Il74hH",
+        "meta": {
+          "chartId": 797,
+          "height": 50,
+          "sliceName": "Treemap",
+          "width": 4
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-gcQJxApOZS",
+          "TABS-afnrUvdxYF",
+          "TAB-jNNd4WWar1",
+          "ROW-7ygtDczaQ"
+        ],
+        "type": "CHART"
+      },
+      "CHART-jJ5Yj1Ptaz": {
+        "children": [],
+        "id": "CHART-jJ5Yj1Ptaz",
+        "meta": {
+          "chartId": 789,
+          "height": 50,
+          "sliceName": "World's Population",
+          "width": 4
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj",
+          "TAB-z81Q87PD7",
+          "ROW-G73z9PIHn"
+        ],
+        "type": "CHART"
+      },
+      "CHART-z4gmEuCqQ5": {
+        "children": [],
+        "id": "CHART-z4gmEuCqQ5",
+        "meta": {
+          "chartId": 788,
+          "height": 50,
+          "sliceName": "Region Filter",
+          "width": 4
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj",
+          "TAB-EcNm_wh922",
+          "ROW-LCjsdSetJ"
+        ],
+        "type": "CHART"
+      },
+      "DASHBOARD_VERSION_KEY": "v2",
+      "GRID_ID": {
+        "children": [],
+        "id": "GRID_ID",
+        "type": "GRID"
+      },
+      "HEADER_ID": {
+        "id": "HEADER_ID",
+        "meta": {
+          "text": "Tabbed Dashboard"
+        },
+        "type": "HEADER"
+      },
+      "ROOT_ID": {
+        "children": [
+          "TABS-lV0r00f4H1"
+        ],
+        "id": "ROOT_ID",
+        "type": "ROOT"
+      },
+      "ROW-7G2o5uDvfo": {
+        "children": [
+          "CHART-c0EjR-OZ0n"
+        ],
+        "id": "ROW-7G2o5uDvfo",
+        "meta": {
+          "background": "BACKGROUND_TRANSPARENT"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS"
+        ],
+        "type": "ROW"
+      },
+      "ROW-7ygtDczaQ": {
+        "children": [
+          "CHART-dxV7Il74hH"
+        ],
+        "id": "ROW-7ygtDczaQ",
+        "meta": {
+          "background": "BACKGROUND_TRANSPARENT"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-gcQJxApOZS",
+          "TABS-afnrUvdxYF",
+          "TAB-jNNd4WWar1"
+        ],
+        "type": "ROW"
+      },
+      "ROW-G73z9PIHn": {
+        "children": [
+          "CHART-jJ5Yj1Ptaz"
+        ],
+        "id": "ROW-G73z9PIHn",
+        "meta": {
+          "background": "BACKGROUND_TRANSPARENT"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj",
+          "TAB-z81Q87PD7"
+        ],
+        "type": "ROW"
+      },
+      "ROW-LCjsdSetJ": {
+        "children": [
+          "CHART-z4gmEuCqQ5"
+        ],
+        "id": "ROW-LCjsdSetJ",
+        "meta": {
+          "background": "BACKGROUND_TRANSPARENT"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj",
+          "TAB-EcNm_wh922"
+        ],
+        "type": "ROW"
+      },
+      "TAB-EcNm_wh922": {
+        "children": [
+          "ROW-LCjsdSetJ"
+        ],
+        "id": "TAB-EcNm_wh922",
+        "meta": {
+          "text": "row tab 1"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj"
+        ],
+        "type": "TAB"
+      },
+      "TAB-NF3dlrWGS": {
+        "children": [
+          "ROW-7G2o5uDvfo",
+          "TABS-CSjo6VfNrj"
+        ],
+        "id": "TAB-NF3dlrWGS",
+        "meta": {
+          "text": "Tab A"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1"
+        ],
+        "type": "TAB"
+      },
+      "TAB-gcQJxApOZS": {
+        "children": [
+          "TABS-afnrUvdxYF"
+        ],
+        "id": "TAB-gcQJxApOZS",
+        "meta": {
+          "text": "Tab B"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1"
+        ],
+        "type": "TAB"
+      },
+      "TAB-jNNd4WWar1": {
+        "children": [
+          "ROW-7ygtDczaQ"
+        ],
+        "id": "TAB-jNNd4WWar1",
+        "meta": {
+          "text": "New Tab"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-gcQJxApOZS",
+          "TABS-afnrUvdxYF"
+        ],
+        "type": "TAB"
+      },
+      "TAB-z81Q87PD7": {
+        "children": [
+          "ROW-G73z9PIHn"
+        ],
+        "id": "TAB-z81Q87PD7",
+        "meta": {
+          "text": "row tab 2"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj"
+        ],
+        "type": "TAB"
+      },
+      "TABS-CSjo6VfNrj": {
+        "children": [
+          "TAB-EcNm_wh922",
+          "TAB-z81Q87PD7"
+        ],
+        "id": "TABS-CSjo6VfNrj",
+        "meta": {},
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS"
+        ],
+        "type": "TABS"
+      },
+      "TABS-afnrUvdxYF": {
+        "children": [
+          "TAB-jNNd4WWar1"
+        ],
+        "id": "TABS-afnrUvdxYF",
+        "meta": {},
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-gcQJxApOZS"
+        ],
+        "type": "TABS"
+      },
+      "TABS-lV0r00f4H1": {
+        "children": [
+          "TAB-NF3dlrWGS",
+          "TAB-gcQJxApOZS"
+        ],
+        "id": "TABS-lV0r00f4H1",
+        "meta": {},
+        "parents": [
+          "ROOT_ID"
+        ],
+        "type": "TABS"
+      }
+    }
+        """
+    )
+    pos = json.loads(js)
+    slices = [
+        db.session.query(Slice).filter_by(slice_name=name).first()
+        for name in tabbed_dash_slices
+    ]
+
+    slices = sorted(slices, key=lambda x: x.id)
+    update_slice_ids(pos, slices)
+    dash.position_json = json.dumps(pos, indent=4)
+    dash.slices = slices
+    dash.dashboard_title = "Tabbed Dashboard"
+    dash.slug = slug
+
+    db.session.merge(dash)
+    db.session.commit()
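Each node in the position JSON above lists its `children` and, for the tabbed layout, its `parents`, which makes the nesting hard to read in flat form. A small hypothetical helper such as `print_layout` below (for inspection only, not part of the loader) walks the tree from `ROOT_ID` and prints the hierarchy:

    def print_layout(position: dict, node_id: str = "ROOT_ID", depth: int = 0) -> None:
        node = position[node_id]
        meta = node.get("meta", {})
        # tabs have a "text" label, charts a "sliceName"; fall back to the node id
        label = meta.get("text") or meta.get("sliceName") or node_id
        print("  " * depth + f"{node['type']}: {label}")
        for child_id in node.get("children", []):
            print_layout(position, child_id, depth + 1)

    # e.g. print_layout(json.loads(js)) prints ROOT -> TABS -> "Tab A"/"Tab B" -> rows -> charts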

+ 163 - 0
BI/examples/unicode_test_data.py

@@ -0,0 +1,163 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import datetime
+import json
+import random
+
+import pandas as pd
+from sqlalchemy import Date, Float, String
+
+from superset import db
+from superset.models.dashboard import Dashboard
+from superset.models.slice import Slice
+from superset.utils import core as utils
+
+from .helpers import (
+    config,
+    get_example_data,
+    get_slice_json,
+    merge_slice,
+    TBL,
+    update_slice_ids,
+)
+
+
+def load_unicode_test_data(only_metadata: bool = False, force: bool = False) -> None:
+    """Loading unicode test dataset from a csv file in the repo"""
+    tbl_name = "unicode_test"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data(
+            "unicode_utf8_unixnl_test.csv", is_gzip=False, make_bytes=True
+        )
+        df = pd.read_csv(data, encoding="utf-8")
+        # generate date/numeric data
+        df["dttm"] = datetime.datetime.now().date()
+        df["value"] = [random.randint(1, 100) for _ in range(len(df))]
+        df.to_sql(  # pylint: disable=no-member
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={
+                "phrase": String(500),
+                "short_phrase": String(10),
+                "with_missing": String(100),
+                "dttm": Date(),
+                "value": Float(),
+            },
+            index=False,
+        )
+        print("Done loading table!")
+        print("-" * 80)
+
+    print("Creating table [unicode_test] reference")
+    obj = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not obj:
+        obj = TBL(table_name=tbl_name)
+    obj.main_dttm_col = "dttm"
+    obj.database = database
+    db.session.merge(obj)
+    db.session.commit()
+    obj.fetch_metadata()
+    tbl = obj
+
+    slice_data = {
+        "granularity_sqla": "dttm",
+        "groupby": [],
+        "metric": {
+            "aggregate": "SUM",
+            "column": {"column_name": "value"},
+            "expressionType": "SIMPLE",
+            "label": "Value",
+        },
+        "row_limit": config["ROW_LIMIT"],
+        "since": "100 years ago",
+        "until": "now",
+        "viz_type": "word_cloud",
+        "size_from": "10",
+        "series": "short_phrase",
+        "size_to": "70",
+        "rotation": "square",
+        "limit": "100",
+    }
+
+    print("Creating a slice")
+    slc = Slice(
+        slice_name="Unicode Cloud",
+        viz_type="word_cloud",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)
+
+    print("Creating a dashboard")
+    dash = db.session.query(Dashboard).filter_by(slug="unicode-test").first()
+
+    if not dash:
+        dash = Dashboard()
+    js = """\
+{
+    "CHART-Hkx6154FEm": {
+        "children": [],
+        "id": "CHART-Hkx6154FEm",
+        "meta": {
+            "chartId": 2225,
+            "height": 30,
+            "sliceName": "slice 1",
+            "width": 4
+        },
+        "type": "CHART"
+    },
+    "GRID_ID": {
+        "children": [
+            "ROW-SyT19EFEQ"
+        ],
+        "id": "GRID_ID",
+        "type": "GRID"
+    },
+    "ROOT_ID": {
+        "children": [
+            "GRID_ID"
+        ],
+        "id": "ROOT_ID",
+        "type": "ROOT"
+    },
+    "ROW-SyT19EFEQ": {
+        "children": [
+            "CHART-Hkx6154FEm"
+        ],
+        "id": "ROW-SyT19EFEQ",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "DASHBOARD_VERSION_KEY": "v2"
+}
+    """
+    dash.dashboard_title = "Unicode Test"
+    pos = json.loads(js)
+    update_slice_ids(pos, [slc])
+    dash.position_json = json.dumps(pos, indent=4)
+    dash.slug = "unicode-test"
+    dash.slices = [slc]
+    db.session.merge(dash)
+    db.session.commit()

+ 574 - 0
BI/examples/world_bank.py

@@ -0,0 +1,574 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Loads datasets, dashboards and slices in a new superset instance"""
+import json
+import os
+import textwrap
+
+import pandas as pd
+from sqlalchemy import DateTime, String
+from sqlalchemy.sql import column
+
+from superset import db
+from superset.connectors.sqla.models import SqlMetric
+from superset.models.dashboard import Dashboard
+from superset.models.slice import Slice
+from superset.utils import core as utils
+
+from .helpers import (
+    config,
+    EXAMPLES_FOLDER,
+    get_example_data,
+    get_slice_json,
+    merge_slice,
+    misc_dash_slices,
+    TBL,
+    update_slice_ids,
+)
+
+
+def load_world_bank_health_n_pop(  # pylint: disable=too-many-locals
+    only_metadata: bool = False, force: bool = False
+) -> None:
+    """Loads the world bank health dataset, slices and a dashboard"""
+    tbl_name = "wb_health_population"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data("countries.json.gz")
+        pdf = pd.read_json(data)
+        pdf.columns = [col.replace(".", "_") for col in pdf.columns]
+        pdf.year = pd.to_datetime(pdf.year)
+        pdf.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=50,
+            dtype={
+                "year": DateTime(),
+                "country_code": String(3),
+                "country_name": String(255),
+                "region": String(255),
+            },
+            index=False,
+        )
+
+    print("Creating table [wb_health_population] reference")
+    tbl = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not tbl:
+        tbl = TBL(table_name=tbl_name)
+    tbl.description = utils.readfile(os.path.join(EXAMPLES_FOLDER, "countries.md"))
+    tbl.main_dttm_col = "year"
+    tbl.database = database
+    tbl.filter_select_enabled = True
+
+    metrics = [
+        "sum__SP_POP_TOTL",
+        "sum__SH_DYN_AIDS",
+        "sum__SH_DYN_AIDS",
+        "sum__SP_RUR_TOTL_ZS",
+        "sum__SP_DYN_LE00_IN",
+        "sum__SP_RUR_TOTL",
+    ]
+    for metric in metrics:
+        if not any(col.metric_name == metric for col in tbl.metrics):
+            aggr_func = metric[:3]
+            col = str(column(metric[5:]).compile(db.engine))
+            tbl.metrics.append(
+                SqlMetric(metric_name=metric, expression=f"{aggr_func}({col})")
+            )
+
+    db.session.merge(tbl)
+    db.session.commit()
+    tbl.fetch_metadata()
+
+    metric = "sum__SP_POP_TOTL"
+    metrics = ["sum__SP_POP_TOTL"]
+    secondary_metric = {
+        "aggregate": "SUM",
+        "column": {
+            "column_name": "SP_RUR_TOTL",
+            "optionName": "_col_SP_RUR_TOTL",
+            "type": "DOUBLE",
+        },
+        "expressionType": "SIMPLE",
+        "hasCustomLabel": True,
+        "label": "Rural Population",
+    }
+
+    defaults = {
+        "compare_lag": "10",
+        "compare_suffix": "o10Y",
+        "limit": "25",
+        "granularity_sqla": "year",
+        "groupby": [],
+        "row_limit": config["ROW_LIMIT"],
+        "since": "2014-01-01",
+        "until": "2014-01-02",
+        "time_range": "2014-01-01 : 2014-01-02",
+        "markup_type": "markdown",
+        "country_fieldtype": "cca3",
+        "entity": "country_code",
+        "show_bubbles": True,
+    }
+
+    print("Creating slices")
+    slices = [
+        Slice(
+            slice_name="Region Filter",
+            viz_type="filter_box",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="filter_box",
+                date_filter=False,
+                filter_configs=[
+                    {
+                        "asc": False,
+                        "clearable": True,
+                        "column": "region",
+                        "key": "2s98dfu",
+                        "metric": "sum__SP_POP_TOTL",
+                        "multiple": True,
+                    },
+                    {
+                        "asc": False,
+                        "clearable": True,
+                        "key": "li3j2lk",
+                        "column": "country_name",
+                        "metric": "sum__SP_POP_TOTL",
+                        "multiple": True,
+                    },
+                ],
+            ),
+        ),
+        Slice(
+            slice_name="World's Population",
+            viz_type="big_number",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                since="2000",
+                viz_type="big_number",
+                compare_lag="10",
+                metric="sum__SP_POP_TOTL",
+                compare_suffix="over 10Y",
+            ),
+        ),
+        Slice(
+            slice_name="Most Populated Countries",
+            viz_type="table",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="table",
+                metrics=["sum__SP_POP_TOTL"],
+                groupby=["country_name"],
+            ),
+        ),
+        Slice(
+            slice_name="Growth Rate",
+            viz_type="line",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="line",
+                since="1960-01-01",
+                metrics=["sum__SP_POP_TOTL"],
+                num_period_compare="10",
+                groupby=["country_name"],
+            ),
+        ),
+        Slice(
+            slice_name="% Rural",
+            viz_type="world_map",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="world_map",
+                metric="sum__SP_RUR_TOTL_ZS",
+                num_period_compare="10",
+                secondary_metric=secondary_metric,
+            ),
+        ),
+        Slice(
+            slice_name="Life Expectancy VS Rural %",
+            viz_type="bubble",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="bubble",
+                since="2011-01-01",
+                until="2011-01-02",
+                series="region",
+                limit=0,
+                entity="country_name",
+                x="sum__SP_RUR_TOTL_ZS",
+                y="sum__SP_DYN_LE00_IN",
+                size="sum__SP_POP_TOTL",
+                max_bubble_size="50",
+                adhoc_filters=[
+                    {
+                        "clause": "WHERE",
+                        "expressionType": "SIMPLE",
+                        "filterOptionName": "2745eae5",
+                        "comparator": [
+                            "TCA",
+                            "MNP",
+                            "DMA",
+                            "MHL",
+                            "MCO",
+                            "SXM",
+                            "CYM",
+                            "TUV",
+                            "IMY",
+                            "KNA",
+                            "ASM",
+                            "ADO",
+                            "AMA",
+                            "PLW",
+                        ],
+                        "operator": "NOT IN",
+                        "subject": "country_code",
+                    }
+                ],
+            ),
+        ),
+        Slice(
+            slice_name="Rural Breakdown",
+            viz_type="sunburst",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="sunburst",
+                groupby=["region", "country_name"],
+                since="2011-01-01",
+                until="2011-01-01",
+                metric=metric,
+                secondary_metric=secondary_metric,
+            ),
+        ),
+        Slice(
+            slice_name="World's Pop Growth",
+            viz_type="area",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                since="1960-01-01",
+                until="now",
+                viz_type="area",
+                groupby=["region"],
+                metrics=metrics,
+            ),
+        ),
+        Slice(
+            slice_name="Box plot",
+            viz_type="box_plot",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                since="1960-01-01",
+                until="now",
+                whisker_options="Min/max (no outliers)",
+                x_ticks_layout="staggered",
+                viz_type="box_plot",
+                groupby=["region"],
+                metrics=metrics,
+            ),
+        ),
+        Slice(
+            slice_name="Treemap",
+            viz_type="treemap",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                since="1960-01-01",
+                until="now",
+                viz_type="treemap",
+                metrics=["sum__SP_POP_TOTL"],
+                groupby=["region", "country_code"],
+            ),
+        ),
+        Slice(
+            slice_name="Parallel Coordinates",
+            viz_type="para",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                since="2011-01-01",
+                until="2011-01-01",
+                viz_type="para",
+                limit=100,
+                metrics=["sum__SP_POP_TOTL", "sum__SP_RUR_TOTL_ZS", "sum__SH_DYN_AIDS"],
+                secondary_metric="sum__SP_POP_TOTL",
+                series="country_name",
+            ),
+        ),
+    ]
+    misc_dash_slices.add(slices[-1].slice_name)
+    for slc in slices:
+        merge_slice(slc)
+
+    print("Creating a World's Health Bank dashboard")
+    dash_name = "World Bank's Data"
+    slug = "world_health"
+    dash = db.session.query(Dashboard).filter_by(slug=slug).first()
+
+    if not dash:
+        dash = Dashboard()
+    dash.published = True
+    js = textwrap.dedent(
+        """\
+{
+    "CHART-36bfc934": {
+        "children": [],
+        "id": "CHART-36bfc934",
+        "meta": {
+            "chartId": 40,
+            "height": 25,
+            "sliceName": "Region Filter",
+            "width": 2
+        },
+        "type": "CHART"
+    },
+    "CHART-37982887": {
+        "children": [],
+        "id": "CHART-37982887",
+        "meta": {
+            "chartId": 41,
+            "height": 25,
+            "sliceName": "World's Population",
+            "width": 2
+        },
+        "type": "CHART"
+    },
+    "CHART-17e0f8d8": {
+        "children": [],
+        "id": "CHART-17e0f8d8",
+        "meta": {
+            "chartId": 42,
+            "height": 92,
+            "sliceName": "Most Populated Countries",
+            "width": 3
+        },
+        "type": "CHART"
+    },
+    "CHART-2ee52f30": {
+        "children": [],
+        "id": "CHART-2ee52f30",
+        "meta": {
+            "chartId": 43,
+            "height": 38,
+            "sliceName": "Growth Rate",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "CHART-2d5b6871": {
+        "children": [],
+        "id": "CHART-2d5b6871",
+        "meta": {
+            "chartId": 44,
+            "height": 52,
+            "sliceName": "% Rural",
+            "width": 7
+        },
+        "type": "CHART"
+    },
+    "CHART-0fd0d252": {
+        "children": [],
+        "id": "CHART-0fd0d252",
+        "meta": {
+            "chartId": 45,
+            "height": 50,
+            "sliceName": "Life Expectancy VS Rural %",
+            "width": 8
+        },
+        "type": "CHART"
+    },
+    "CHART-97f4cb48": {
+        "children": [],
+        "id": "CHART-97f4cb48",
+        "meta": {
+            "chartId": 46,
+            "height": 38,
+            "sliceName": "Rural Breakdown",
+            "width": 3
+        },
+        "type": "CHART"
+    },
+    "CHART-b5e05d6f": {
+        "children": [],
+        "id": "CHART-b5e05d6f",
+        "meta": {
+            "chartId": 47,
+            "height": 50,
+            "sliceName": "World's Pop Growth",
+            "width": 4
+        },
+        "type": "CHART"
+    },
+    "CHART-e76e9f5f": {
+        "children": [],
+        "id": "CHART-e76e9f5f",
+        "meta": {
+            "chartId": 48,
+            "height": 50,
+            "sliceName": "Box plot",
+            "width": 4
+        },
+        "type": "CHART"
+    },
+    "CHART-a4808bba": {
+        "children": [],
+        "id": "CHART-a4808bba",
+        "meta": {
+            "chartId": 49,
+            "height": 50,
+            "sliceName": "Treemap",
+            "width": 8
+        },
+        "type": "CHART"
+    },
+    "COLUMN-071bbbad": {
+        "children": [
+            "ROW-1e064e3c",
+            "ROW-afdefba9"
+        ],
+        "id": "COLUMN-071bbbad",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT",
+            "width": 9
+        },
+        "type": "COLUMN"
+    },
+    "COLUMN-fe3914b8": {
+        "children": [
+            "CHART-36bfc934",
+            "CHART-37982887"
+        ],
+        "id": "COLUMN-fe3914b8",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT",
+            "width": 2
+        },
+        "type": "COLUMN"
+    },
+    "GRID_ID": {
+        "children": [
+            "ROW-46632bc2",
+            "ROW-3fa26c5d",
+            "ROW-812b3f13"
+        ],
+        "id": "GRID_ID",
+        "type": "GRID"
+    },
+    "HEADER_ID": {
+        "id": "HEADER_ID",
+        "meta": {
+            "text": "World's Bank Data"
+        },
+        "type": "HEADER"
+    },
+    "ROOT_ID": {
+        "children": [
+            "GRID_ID"
+        ],
+        "id": "ROOT_ID",
+        "type": "ROOT"
+    },
+    "ROW-1e064e3c": {
+        "children": [
+            "COLUMN-fe3914b8",
+            "CHART-2d5b6871"
+        ],
+        "id": "ROW-1e064e3c",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "ROW-3fa26c5d": {
+        "children": [
+            "CHART-b5e05d6f",
+            "CHART-0fd0d252"
+        ],
+        "id": "ROW-3fa26c5d",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "ROW-46632bc2": {
+        "children": [
+            "COLUMN-071bbbad",
+            "CHART-17e0f8d8"
+        ],
+        "id": "ROW-46632bc2",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "ROW-812b3f13": {
+        "children": [
+            "CHART-a4808bba",
+            "CHART-e76e9f5f"
+        ],
+        "id": "ROW-812b3f13",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "ROW-afdefba9": {
+        "children": [
+            "CHART-2ee52f30",
+            "CHART-97f4cb48"
+        ],
+        "id": "ROW-afdefba9",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "DASHBOARD_VERSION_KEY": "v2"
+}
+    """
+    )
+    pos = json.loads(js)
+    update_slice_ids(pos, slices)
+
+    dash.dashboard_title = dash_name
+    dash.position_json = json.dumps(pos, indent=4)
+    dash.slug = slug
+
+    dash.slices = slices[:-1]
+    db.session.merge(dash)
+    db.session.commit()
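The metric names above follow an `aggregate__column` convention; the loader splits them with fixed offsets (`metric[:3]` for the aggregate, `metric[5:]` for the column). The hypothetical helper below just makes that convention explicit:

    def metric_to_expression(metric_name: str) -> str:
        # "sum__SP_POP_TOTL" -> aggregate "sum" over column "SP_POP_TOTL"
        aggr_func, col = metric_name.split("__", 1)
        return f"{aggr_func}({col})"

    assert metric_to_expression("sum__SP_POP_TOTL") == "sum(SP_POP_TOTL)"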

+ 580 - 0
BI/income_disparity_final_version_2.py

@@ -0,0 +1,580 @@
+# -*- coding: utf-8 -*-
+"""income_disparity.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/1upuHuQ3gWDkpbvkvHl2uTQlSv20JZnf2
+"""
+
+
+#!pip install pandas-datareader
+import wbdata
+import datetime
+import numpy as np
+import pandas as pd
+from pandas_datareader import wb
+import matplotlib.pyplot as plt
+from sklearn.linear_model import LinearRegression as lr
+from matplotlib.pyplot import MultipleLocator
+
+
+# =============================================================================
+# # Part 1: API Integration
+# =============================================================================
+
+# =============================================================================
+# # API method 1: using wbdata module
+# =============================================================================
+
+# # searching for a country's code by name
+# print(wbdata.search_countries('United Kingdom'))
+
+# list of countries
+countries = ["USA", "BEL", "BRA", "COL", "FRA", "DEU", "GRC", "IDN", "IRL", "MEX", "NLD", "RUS"]
+# date period
+dates = datetime.datetime(2008, 1, 1), datetime.datetime(2018, 1, 1)
+
+# data object
+indicators = {'SI.DST.05TH.20':'Income share held by highest 20%', 'SI.DST.FRST.20': 'Income share held by lowest 20%', \
+             'SL.EMP.TOTL.SP.FE.NE.ZS': 'Employment to population ratio, 15+, female (%) (national estimate)',\
+             'SL.EMP.TOTL.SP.MA.NE.ZS': 'Employment to population ratio, 15+, male (%) (national estimate)'}
+
+# getting data from these countries
+raw_data = wbdata.get_dataframe(indicators, country=countries, data_date=dates, convert_date=True)
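+# (the returned DataFrame is indexed by country and date, with one column per indicator label)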
+
+raw_unstacked_data = raw_data.unstack(level=0)
+
+# printing our data object
+# print(raw_data)
+# print(raw_unstacked_data)
+
+# =============================================================================
+# # API method 2: using wb from pandas_datareader and converting the result to a DataFrame
+# =============================================================================
+
+# view all data
+pd.set_option('display.max_columns', 15) 
+pd.set_option('display.max_rows', 15) 
+
+df1 = wb.download(indicator = indicators, country = countries,  start = 2008, end = 2018)
+date_period = [i for i in range(2008, 2019)]
+print(df1)
+
+# create a new DataFrame df2 for later use, so that calculations on df2 do not change the original values in df1
+# rename the columns to the full indicator names
+df2 = df1.rename(columns = {'SI.DST.05TH.20':'Income share held by highest 20%', 'SI.DST.FRST.20': 'Income share held by lowest 20%', \
+             'SL.EMP.TOTL.SP.FE.NE.ZS': 'Employment to population ratio, 15+, female (%) (national estimate)',\
+             'SL.EMP.TOTL.SP.MA.NE.ZS': 'Employment to population ratio, 15+, male (%) (national estimate)'}, inplace = False)
+
+# overview of our DataFrame
+# Data manipulation: fill the missing values with the column mean, which has relatively little impact on our data sets
+df2.mean()
+df2.fillna(df2.mean(), inplace = True)
+print(df2)
+
+# Overview the edited DataFrame and get basic summary statistics
+print(df2.describe())
+
+
+
+
+# =============================================================================
+# # Part 2: Data structure set up
+# =============================================================================
+
+# =============================================================================
+# # creating our Data Structure type I
+# =============================================================================
+
+# step I: convert DataFrame to a list in correct order from 2008 to 2018
+def country_DataFrame_to_list(country, target_data):
+  df = wb.download(indicator = target_data, country = country,  start = 2008, end = 2018)
+  df.fillna(df.mean(), inplace = True)
+  df_list =df[df.columns[0]].tolist()
+  round_list = [round(i, 2) for i in df_list ]
+  return round_list[::-1]
+
+# step II: make a list of tuples, a convenient way to store our data
+def country_tuples(country_list, time):
+  return list(zip(country_list, time))
+
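+# Illustrative example of step II (hypothetical values):
+#   country_tuples([2008, 2009], [45.1, 44.8]) -> [(2008, 45.1), (2009, 44.8)]
+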
+# step III (helper): calculate the element-wise gap between two lists
+def gap_between(toplist, lowlist):
+  gap_list = []
+  for i in range(len(toplist)):
+    gap_list.append(round((toplist[i]- lowlist[i]), 2))
+  return gap_list
+
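+# Illustrative example of the helper (hypothetical values):
+#   gap_between([45.0, 46.0], [5.0, 5.5]) -> [40.0, 40.5]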
+
+
+# step IV: make a dictionary of lists of tuples, which is one of the data structures of this project,
+# named Data Structure type I.
+def object_Dictionary(country_list, object_target, date_period):
+  object_df = {}
+  for country in country_list:
+    object_df[country] = country_tuples(date_period, country_DataFrame_to_list(country, object_target))
+  return object_df
+
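+# Data Structure type I then looks like (hypothetical values):
+#   {'USA': [(2008, 46.1), (2009, 46.3), ..., (2018, 46.9)], 'BEL': [...], ...}
+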
+# step V: start to build: 
+    
+    
+# This data set is for storing data of Income share held by highest 20%
+Top_20_df = object_Dictionary(countries, 'SI.DST.05TH.20', date_period)
+
+# This data set is for storing data of Income share held by lowest 20%
+Low_20_df = object_Dictionary(countries, 'SI.DST.FRST.20', date_period)
+
+# This data set is for storing data of 'Employment to population ratio, 15+, female (%) (national estimate)'
+female_employ_df = object_Dictionary(countries, 'SL.EMP.TOTL.SP.FE.NE.ZS', date_period)
+
+# This data set is for storing data of 'Employment to population ratio, 15+, male (%) (national estimate)'
+male_employ_df = object_Dictionary(countries, 'SL.EMP.TOTL.SP.MA.NE.ZS', date_period)
+
+
+
+
+# =============================================================================
+# # creating our Data Structure type II: convert our Data Structure type I to type II
+# =============================================================================
+# step 1: write a function that unpacks the dictionary of tuples into a new dictionary of plain lists and calculates the gap
+def no_tuple_dic(object_Dictionary1, object_Dictionary2):
+  new_dict = {}
+  for i in countries:
+    new_list = []
+    for j in range(11):
+      # The gap_between helper is not reused here because the new dictionary should not contain the year
+      new_list.append(round((object_Dictionary1[i][j][1]- object_Dictionary2[i][j][1]), 2)) 
+    new_dict[i] = new_list  
+
+  return new_dict
+
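+# Data Structure type II starts from a dict of plain gap lists (hypothetical values), e.g.
+#   {'USA': [40.1, 40.3, ...], 'BEL': [...], ...},
+# which is then turned into a pandas DataFrame with one column per country.
+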
+# step 2: getting the income gap dictionary of list between income share held by highest 20% and income share held by lowest 20%
+income_gap_dict = no_tuple_dic(Top_20_df, Low_20_df)
+
+# step 3: create our Data structure type II, DataFrame
+income_gap_dict_df = pd.DataFrame(income_gap_dict, columns = countries)
+
+# step 4: show the basic statistic info of our income gap DataFrame
+print(round(income_gap_dict_df.describe(),2))
+
+# same step as above, to get our Data Structure type II, between male employment population and female employment population
+gender_gap_dict = no_tuple_dic(male_employ_df, female_employ_df)
+
+gender_gap_dict_df = pd.DataFrame(gender_gap_dict, columns = countries)
+print(round(gender_gap_dict_df.describe(),2))
+
+
+
+# Data Structure function application
+
+# This function calculates the gap between the income share held by the highest 20% and the lowest 20%
+def gap_income_Dataframe(country):
+  gap = {}
+  for i in range(len(Top_20_df[country])):
+    year1, data1 = Top_20_df[country][i]
+    year2, data2 = Low_20_df[country][i]  
+    if year1 == year2:
+      gap[year1] = round(data1-data2, 2)
+  return gap
+
+# This function calculates the gap between the male and female employment-to-population ratios
+def gap_gender_Dataframe(country):
+  gap = {}
+  for i in range(len(male_employ_df[country])):
+    year1, data1 = male_employ_df[country][i]
+    year2, data2 = female_employ_df[country][i]  
+    if year1 == year2:
+      gap[year1] = round(data1-data2, 2)
+  return gap
+
+# This function searches the data for a specific country and year
+def searching_data(object_Dictionary, country, year):
+  country_list = []
+  if country in countries:
+    for i in range(11):
+      country_list.append(object_Dictionary[country][i])
+  
+  output = [item for item in country_list if item[0] == year]
+  # return an empty list if no data is found, or a one-element list [(year, value)] if the country and year are valid
+  return output
+
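+# Illustrative usage (hypothetical value): searching_data(Top_20_df, 'USA', 2012)
+# would return a one-element list such as [(2012, 46.5)], or [] if nothing matches.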
+
+
+
+
+# =============================================================================
+# # Part 3: Plotting the data sets
+# =============================================================================
+
+
+# =============================================================================
+# #plot 1: Income gap from 2008 to 2018
+# =============================================================================
+
+from matplotlib.pyplot import MultipleLocator
+plt.title('Income gap from 2008 to 2018')
+plt.xlabel('Year')
+plt.ylabel('Income gap%')
+all_data_i = []
+
+for c in countries:
+  gap_i = gap_income_Dataframe(c)
+  x_i = gap_i.keys()
+  y_i = gap_i.values()
+  all_data_i.append(gap_i)
+  plt.scatter(x_i,y_i,marker='+',label=c)
+  plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
+
+x_major_locator=MultipleLocator(1)  #set the x interval as 1
+y_major_locator=MultipleLocator(2)   #set the y interval as 2
+ax=plt.gca()
+ax.xaxis.set_major_locator(x_major_locator)     #Set the major scale of the x-axis to a multiple of 1
+ax.yaxis.set_major_locator(y_major_locator)     #Set the major scale of the y-axis to a multiple of 2
+plt.xlim(2007,2019)   #Set the x-axis range from 2007 to 2019, slightly wider than 2008-2018 so the end points stay clearly visible
+plt.ylim(25,60)     #Set the y scale range of the y-axis from 25 to 60
+
+N = 10000
+xr_i = list(range(2008,2019))
+yr_i = []
+for i in xr_i:
+  temp = 0
+  for j in all_data_i:
+    temp += j[i]
+  temp /= len(countries)
+  yr_i.append(temp)
+plt.plot(xr_i,yr_i,"r-",label='average')
+plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
+plt.savefig('Income gap.pdf')  
+plt.show()
+
+# =============================================================================
+# #plot 2: Gender Employment rate gap from 2008 to 2018
+# =============================================================================
+
+plt.title('Gender Employment rate gap from 2008 to 2018')
+plt.xlabel('Year')
+plt.ylabel('Gender Employment Gap %')
+all_data_j = []
+for c in countries:
+  gap_j = gap_gender_Dataframe(c)
+  x_j = gap_j.keys()
+  y_j = gap_j.values()
+  all_data_j.append(gap_j)
+  plt.scatter(x_j,y_j,marker='+',label=c)
+  plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
+
+x_major_locator=MultipleLocator(1)  #set the x interval as 1
+y_major_locator=MultipleLocator(2)   #set the y interval as 2
+ax=plt.gca()
+ax.xaxis.set_major_locator(x_major_locator)     #Set the major scale of the x-axis to a multiple of 1
+ax.yaxis.set_major_locator(y_major_locator)     #Set the major scale of the y-axis to a multiple of 2
+plt.xlim(2007,2019)   #Set the x-axis range from 2007 to 2019, slightly wider than 2008-2018
+plt.ylim(6,38)     #Set the scale range of the y-axis from 6 to 38
+
+N = 10000
+xr_j = list(range(2008,2019))
+yr_j = []
+for i in xr_j:
+  temp = 0
+  for j in all_data_j:
+    temp += j[i]
+  temp /= len(countries)
+  yr_j.append(temp)
+plt.plot(xr_j,yr_j,"r-",label='average')
+plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
+plt.show()
+
+# =============================================================================
+# #boxplot 1 income gap
+# =============================================================================
+
+plt.figure(figsize=(9,6),dpi=60)
+
+labels, data = [*zip(*income_gap_dict.items())]  # 'transpose' items to parallel key, value lists
+
+# or, a backwards-compatible alternative
+labels, data = income_gap_dict.keys(), income_gap_dict.values()
+plt.title('Income Gap from 2008 to 2018')
+plt.xlabel('Country')
+plt.ylabel('Income Gap %')
+plt.boxplot(data)
+plt.xticks(range(1, len(labels) + 1), labels)
+plt.show()
+
+# =============================================================================
+# #boxplot 2 gender employment gap
+# =============================================================================
+
+plt.figure(figsize=(9,6),dpi=60)
+
+labels, data = [*zip(*gender_gap_dict.items())]  # 'transpose' items to parallel key, value lists
+
+# or, a backwards-compatible alternative
+labels, data = gender_gap_dict.keys(), gender_gap_dict.values()
+plt.title('Gender Employment Gap')
+plt.xlabel('Country')
+plt.ylabel('Gender Employment Gap %')
+plt.boxplot(data)
+plt.xticks(range(1, len(labels) + 1), labels)
+plt.show()
+
+# =============================================================================
+# #Part 4: linear regression
+# =============================================================================
+
+import numpy as np
+from sklearn.linear_model import LinearRegression
+
+# Convert the original data frame to list
+def convert_to_target_data_dict(country_list):
+    converted_dict = {}
+
+    for i in range(len(country_list)):
+        country_name = country_list[i]
+        converted_dict[country_name] = {}
+        gap_income_dict = gap_income_Dataframe(country_name)
+        gap_gender_dict = gap_gender_Dataframe(country_name)
+        converted_gap_income_list = []
+        converted_gap_gender_list = []
+
+        for k in gap_income_dict:
+            converted_gap_income_list.append(gap_income_dict[k])
+            converted_gap_gender_list.append(gap_gender_dict[k])
+
+        converted_dict[country_name]["income"] = converted_gap_income_list
+        converted_dict[country_name]["gender"] = converted_gap_gender_list
+
+    return converted_dict
+
+
+# Work out the x-coordinates for linear regression
+def x_coordinate():
+    x_list = []
+    x_coordinate = 2008
+    for i in range(11):
+        x_list.append(x_coordinate)
+        x_coordinate = x_coordinate + 1
+
+    return x_list
+
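+# (x_coordinate() returns [2008, 2009, ..., 2018]; an equivalent one-liner is list(range(2008, 2019)).)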
+
+# Work out the linear regression for single country
+def linear_regression(country_name, coordinate_dict, data_type, predict_time):
+    y_list = coordinate_dict[country_name][data_type]
+    x_list = x_coordinate()
+    x = np.array(x_list).reshape((-1, 1))
+    y = np.array(y_list)
+
+    linear_model = LinearRegression().fit(x, y)
+
+    predict_year = np.array([predict_time]).reshape((-1, 1))
+    ten_year_prediction = linear_model.predict(predict_year)
+    
+
+    return ten_year_prediction[0]
+
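+# Illustrative usage (hypothetical call): linear_regression('USA', y_dict, 'income', 2030)
+# fits a straight line to the 2008-2018 income-gap values for the USA and returns the value
+# extrapolated to 2030, where y_dict is built by convert_to_target_data_dict(countries).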
+
+# Work out the final predicted result for the income and gender gap of 2030
+def total_linear_regression_result(y_coordinate_dict):
+    linear_regression_result_dict = {}
+
+    for k in y_coordinate_dict:
+        linear_regression_result_dict[k] = {}
+        predict_income_gap_2030 = linear_regression(k, y_coordinate_dict, "income", 2030)
+        predict_gender_gap_2030 = linear_regression(k, y_coordinate_dict, "gender", 2030)
+        linear_regression_result_dict[k]["income"] = predict_income_gap_2030
+        linear_regression_result_dict[k]["gender"] = predict_gender_gap_2030
+
+    return linear_regression_result_dict
+
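+# The result has the shape {country: {'income': <2030 prediction>, 'gender': <2030 prediction>}}.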
+
+# Calculate the average income & gender gap of 2030
+def calculate_average_gap(result_dict, country_list):
+    average_result_dict = {}
+    sum_income_gap = 0
+    sum_gender_gap = 0
+
+    for k in result_dict:
+        sum_income_gap = sum_income_gap + result_dict[k]["income"]
+        sum_gender_gap = sum_gender_gap + result_dict[k]["gender"]
+
+    average_income_gap = sum_income_gap / len(country_list)
+    average_gender_gap = sum_gender_gap / len(country_list)
+
+    average_result_dict["average_income_gap"] = average_income_gap
+    average_result_dict["average_gender_gap"] = average_gender_gap
+
+    return average_result_dict
+
+
+# Compare the average value with our linear regression results and
+# return lists of the countries that are higher than, lower than, or equal to the average prediction
+def compare_with_the_average(average_dict, result_dict):
+    compare_result_dict = {}
+    higher_than_income_average = []
+    lower_than_income_average = []
+    equal_to_income_average = []
+    higher_than_gender_average = []
+    lower_than_gender_average = []
+    equal_to_gender_average = []
+
+    for k in result_dict:
+        if result_dict[k]["income"] > average_dict["average_income_gap"]:
+            higher_than_income_average.append(k)
+        elif result_dict[k]["income"] < average_dict["average_income_gap"]:
+            lower_than_income_average.append(k)
+        elif result_dict[k]["income"] == average_dict["average_income_gap"]:
+            equal_to_income_average.append(k)
+
+        if result_dict[k]["gender"] > average_dict["average_gender_gap"]:
+            higher_than_gender_average.append(k)
+        elif result_dict[k]["gender"] < average_dict["average_gender_gap"]:
+            lower_than_gender_average.append(k)
+        elif result_dict[k]["gender"] == average_dict["average_gender_gap"]:
+            equal_to_gender_average.append(k)
+
+    compare_result_dict["higher_than_income_average"] = higher_than_income_average
+    compare_result_dict["lower_than_income_average"] = lower_than_income_average
+    compare_result_dict["equal_to_income_average"] = equal_to_income_average
+
+    compare_result_dict["higher_than_gender_average"] = higher_than_gender_average
+    compare_result_dict["lower_than_gender_average"] = lower_than_gender_average
+    compare_result_dict["equal_to_gender_average"] = equal_to_gender_average
+
+    return compare_result_dict
+
+
+def main():
+    # Work out the linear regression result for the 'countries' list
+    y_dict = convert_to_target_data_dict(countries)
+    linear_regression_result_dict = total_linear_regression_result(y_dict)
+
+    # Work out the average income & gender gap
+    average_gap_result = calculate_average_gap(linear_regression_result_dict, countries)
+
+    # Compare the average gap with the gap for each country
+    compare_with_average = compare_with_the_average(average_gap_result, linear_regression_result_dict)
+
+    # Print the results
+    print(linear_regression_result_dict)
+    print()
+    print(average_gap_result)
+    print()
+    print(compare_with_average)
+    return linear_regression_result_dict,average_gap_result,compare_with_average
+
+
+if __name__ == "__main__":
+    linear_regression_result_dict,average_gap_result,compare_with_average = main()
+
+
+# overview of our linear regression results
+print()
+print(linear_regression_result_dict)
+
+
+# =============================================================================
+# #Part 5: plot the figures with our predictions for comparison
+# =============================================================================
+
+# Commented out IPython magic to ensure Python compatibility.
+# =============================================================================
+# #plot 1 for income gap with prediction in 2030
+# =============================================================================
+# %matplotlib inline
+from matplotlib.pyplot import MultipleLocator
+plt.figure(figsize=(12,6),dpi=60)
+plt.title('Prediction of Income Gap in 2030')
+plt.xlabel('Year')
+plt.ylabel('Income gap%')
+all_data_i = []
+
+xr_x = list(range(2008,2019))
+xr_x.append(2030)
+# xr_x = list(map(lambda x:str(x),xr_x))
+for c in countries:
+  gap_i = gap_income_Dataframe(c)
+  x_i = list(gap_i.keys())
+  y_i = list(gap_i.values())
+  tmp = linear_regression_result_dict[c]
+  x_i.append(2019)
+  y_i.append(tmp["income"])
+  gap_i[2019] = tmp["income"]
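+  # Note: the 2030 prediction is stored under the placeholder key 2019 so the averaging
+  # loop below can include it; the scatter itself plots it at x = 2030 via xr_x.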
+  all_data_i.append(gap_i)
+  plt.scatter(xr_x,y_i,marker='+',label=c)
+  plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
+
+x_major_locator=MultipleLocator(1)  #set the x interval as 1
+y_major_locator=MultipleLocator(2)   #set the y interval as 2
+ax=plt.gca()
+ax.xaxis.set_major_locator(x_major_locator)     #Set the major scale of the x-axis to a multiple of 1
+ax.yaxis.set_major_locator(y_major_locator)     #Set the major scale of the y-axis to a multiple of 2
+plt.xlim(2007,2031)   #Set the x-axis range from 2007 to 2031, covering 2008-2018 plus the 2030 prediction
+plt.ylim(25,60)     #Set the scale range of the y-axis from 25 to 60
+
+
+xr_i = list(range(2008,2019))
+xr_i.append(2019)
+yr_i = []
+for i in xr_i:
+  temp = 0
+  for j in all_data_i:
+    temp += j[i]
+  temp /= len(countries)
+  yr_i.append(temp)
+
+plt.plot(xr_x,yr_i,"r-",label='average')
+plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
+plt.savefig('Income gap.pdf')  
+plt.show()
+
+
+
+
+# =============================================================================
+# #plot 2 for gender gap with prediction in 2030
+# =============================================================================
+plt.figure(figsize=(12,6),dpi=60)
+plt.title('Prediction of Gender Employment Gap in 2030')
+plt.xlabel('Year')
+plt.ylabel('Gender Employment Gap %')
+all_data_j = []
+
+xr_x = list(range(2008,2019))
+xr_x.append(2030)
+for c in countries:
+  gap_j = gap_gender_Dataframe(c)
+  x_j = list(gap_j.keys())
+  y_j = list(gap_j.values())
+  tmp = linear_regression_result_dict[c]
+  x_j.append(2019)
+  y_j.append(tmp["gender"])
+  gap_j[2019] = tmp["gender"]
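+  # As in plot 1, the 2030 prediction is stored under the placeholder key 2019.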
+  all_data_j.append(gap_j)
+  plt.scatter(xr_x,y_j,marker='+',label=c)
+  plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
+
+x_major_locator=MultipleLocator(1)  #set the x interval as 1
+y_major_locator=MultipleLocator(2)   #set the y interval as 2
+ax=plt.gca()
+ax.xaxis.set_major_locator(x_major_locator)     #Set the major scale of the x-axis to a multiple of 1
+ax.yaxis.set_major_locator(y_major_locator)     #Set the major scale of the y-axis to a multiple of 2
+plt.xlim(2007,2031)   #Set the x-axis range from 2007 to 2031, covering 2008-2018 plus the 2030 prediction
+plt.ylim(2,38)     #Set the scale range of the y-axis from 2 to 38
+
+
+xr_j = list(range(2008,2019))
+xr_j.append(2019)
+yr_j = []
+for i in xr_j:
+  temp = 0
+  for j in all_data_j:
+    temp += j[i]
+  temp /= len(countries)
+  yr_j.append(temp)
+plt.plot(xr_x,yr_j,"r-",label='average')
+plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
+plt.show()
+
+

+ 338 - 0
BI/macro_analysis-backup.py

@@ -0,0 +1,338 @@
+import pandas as pd
+from bokeh.plotting import figure, save, show,output_file, ColumnDataSource
+from bokeh.models import HoverTool
+import matplotlib.pyplot as plt
+
+class DataFrameAnalysis:
+    """Arms Macro-Analysis capability to a dataframe"""
+    def __init__(self, frame):
+        self.df = frame  # dataframe object
+
+    def avg_discount_rate(self):
+        """Calculates average discount rate of all orders."""
+        # You should calculate the average and gross discount rate.
+        self.df['Discount_Amount'] = pd.to_numeric(self.df['Discount_Amount'])
+        self.df['Order_Total_Amount'] = pd.to_numeric(self.df['Order_Total_Amount'])
+        total_sales_amount = self.df['Order_Total_Amount'].sum()
+        total_discount_amount = self.df['Discount_Amount'].sum()
+        total_discount_avg = int((total_discount_amount / (total_discount_amount+total_sales_amount))*100)
+        return print(f'Customer Discount Avg: {total_discount_avg}%')
+
+
+    def customer_role_breakdown(self):
+        """Calculates proportion of retail/wholesale as a function of sales."""
+        retail = 0
+        wholesale = 0
+        sum_count =int(len(self.df.index))
+        sum_sales = self.df['Order_Total_Amount'].sum()
+        retail_customer_count = round((len(self.df.loc[self.df['Customer_Role']=='Customer'].index)/sum_count)*100)
+        wholesale_customer_count = round((len(self.df.loc[self.df['Customer_Role']=='Wholesale Customer'].index)/sum_count)*100)
+        retail_sales = round((self.df['Order_Total_Amount'].loc[self.df['Customer_Role']=='Customer'].sum()/sum_sales)*100)
+        wholesale_sales = round((self.df['Order_Total_Amount'].loc[self.df['Customer_Role']=='Wholesale Customer'].sum()/sum_sales)*100)
+        grid = [[retail_customer_count,wholesale_customer_count],[retail_sales,wholesale_sales]]
+        crb_df = pd.DataFrame(data=grid, columns=['Retail','Wholesale'], index=['Proportional Order Counts', 'Proportional Sales'])
+        plt.style.use('seaborn-deep')
+        fig, ax = plt.subplots(figsize=(10, 10))
+        crb_df.plot.bar(title='Customer Role Breakdown', xlabel='Customer Role', ylabel='Proportion (%)',
+                        cmap='winter', ax=ax)
+        plt.savefig('Customer_Role_Breakdown.png')
+        print(crb_df.head(3))
+
+    def geographical_breakdown(self):
+        """ Displays a scatterplot of Sales/Revenue weights for different States."""
+        self.df = self.df[self.df.Country_Name_Shipping== 'United States (US)']
+        counts = self.df["State_Name_Shipping"].value_counts().to_dict()
+        States = list(counts.keys())
+        Count = list(counts.values())
+        geo = pd.DataFrame({'States': States, 'Counts': Count})
+        geo_dataframe = pd.DataFrame(geo)
+        geo_dataframe.insert(loc=2, column="Sales_Total", value=0)
+        geo_dataframe.insert(loc=3, column="Avg_Purchase_Revenue", value=0)
+        for i, row in self.df.iterrows():
+            state = row.loc['State_Name_Shipping']
+            total = row.loc['Order_Total_Amount']
+            idx = geo_dataframe[geo_dataframe["States"] == state].index.item()
+            geo_dataframe.at[idx, 'Sales_Total'] += total
+            # recompute the average after adding this order so the last order is included
+            av = int(geo_dataframe.at[idx, 'Sales_Total']) / int(geo_dataframe.at[idx, 'Counts'])
+            geo_dataframe.at[idx, 'Avg_Purchase_Revenue'] = av
+        # data visualization
+        cds = ColumnDataSource(geo_dataframe)
+        cds.data.keys()
+        visual = figure(tools='box_zoom, pan, reset',
+                        width=700, height=700,
+                        title='Geographical Sales Breakdown',
+                        y_axis_label='Order Quantity', x_axis_label='Revenue')
+        visual.circle('Sales_Total', 'Counts', size=7, source=cds, name= 'States')
+        visual.add_tools(HoverTool(tooltips=[("State", "@States"),
+                                             ("Average Purchase Revenue", "@Avg_Purchase_Revenue")
+                                             ]))
+        output_file('geographical_breakdown.html')
+        save(visual)
+        show(visual)
+        return print(geo_dataframe)
+
+
+class ProductAnalysis:
+    """Arms product analysis capability to a dataframe"""
+
+    def __init__(self, frame):
+        self.df = frame  # dataframe object
+        self.analysis_frame = self.monthly_product_frame()
+        self.time_span = self.serve_time_span()  # list of tuples: x[0] == year, x[1] == month for x in self.time_span
+
+    def monthly_product_frame(self):
+        """Analyzes the order lines in the CSV_Files folder and
+        Returns a pandas Dataframe with monthly product statistics."""
+        from datetime import datetime
+        import information_repository as ir
+        frame = self.df
+        frame = frame[['Order_Date', 'Product_Name', 'Quantity', 'Item_Cost']]
+        dict_list = []
+        for i, row in frame.iterrows():
+            row_date = row['Order_Date']
+            row_date = datetime.strptime(row_date, "%Y-%m-%d %H:%M")
+            row_date_month = row_date.month
+            row_date_year = row_date.year
+            raw_products = row['Product_Name'].replace('\r', '').split('\n')
+            raw_quantities = row['Quantity'].replace('\r', '').split('\n')
+            raw_cost = row['Item_Cost'].replace('\r', '').split('\n')
+            for key in range(len(raw_products)):
+                product = [i for i in ir.p_list if i in raw_products[key]][0]
+                quantity = int(raw_quantities[key])
+                revenue = float(raw_cost[key])
+                dict_object = [product, quantity, revenue, row_date_month, row_date_year]
+                matched_dictionary = [i for i in dict_list if
+                                      i['name'] == dict_object[0] and i['month'] == dict_object[3]
+                                      and i['year'] == dict_object[4]]
+                if len(matched_dictionary) == 1:
+                    matched_dictionary[0]['count'] += dict_object[1]
+                    matched_dictionary[0]['revenue'] += dict_object[2]
+                else:
+                    dict_list.append({'name': dict_object[0], 'count': dict_object[1],
+                                      'revenue': dict_object[2], 'month': dict_object[3], 'year': dict_object[4]})
+        self.analysis_frame = pd.DataFrame(columns=['year', 'month', 'count', 'revenue', 'change_over_month', 'product'])
+        time_span = []
+        for product in ir.p_list:
+            product_dictionaries = sorted(
+                sorted([i for i in dict_list if i['name'] == product], key=lambda x: x['month']
+                       ), key=lambda x: x['year'])
+            data_list = []
+            year_list = []
+            month_list = []
+            for key in range(len(product_dictionaries)):
+                if key > 0:
+                    try:
+                        change_over_month = (100 - round(
+                            ((product_dictionaries[key]['revenue'] / product_dictionaries[key]['count'])
+                             / (product_dictionaries[key - 1]['revenue'] / product_dictionaries[key - 1][
+                                        'count'])) * 100))
+
+                    except IndexError:
+                        print('change_list calls need to be refined')
+                else:
+                    change_over_month = 0
+
+                row_list = [product_dictionaries[key]['year'], product_dictionaries[key]['month'],
+                            product_dictionaries[key]['count'], product_dictionaries[key]['revenue'], change_over_month,
+                            product_dictionaries[key]['name']]
+                data_list.append(row_list)
+                if product == 'Blue Moon':
+                    month_list.append(product_dictionaries[key]['month'])
+                    year_list.append(product_dictionaries[key]['year'])
+
+            if product == 'Blue Moon':
+                time_span = [*zip(year_list, month_list)]
+            append_frame = pd.DataFrame(data=data_list,
+                                        columns=['year', 'month', 'count', 'revenue', 'change_over_month', 'product'])
+            self.analysis_frame = pd.concat([self.analysis_frame, append_frame], ignore_index=True)
+        self.time_span = time_span
+        return self.analysis_frame
+
+    def highest_positive_product_change_over_month_analysis(self):
+        """Analyzes the monthly_product_frame and returns the 5 products whose sales level increased the most"""
+        year = int(input('Type the year you would like to query in yyyy format:  '))
+        month = int(input('Type the month you would like to query:  '))
+        data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[self.analysis_frame['year'] == year].loc[self.analysis_frame['revenue']>500]
+        data_slice.sort_values(by='change_over_month', inplace=True, ascending=False)
+        return print(data_slice.head(5))
+
+    def highest_negative_product_change_over_month_analysis(self):
+        """Analyzes the monthly_product_frame and returns the 5 products whose sales level decreased the most"""
+        year = int(input('Type the year you would like to query in yyyy format:  '))
+        month = int(input('Type the month you would like to query:  '))
+        data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[self.analysis_frame['year'] == year].loc[self.analysis_frame['revenue']>500]
+        data_slice.sort_values(by='change_over_month', inplace=True, ascending=True)
+        return data_slice
+
+    def product_line_change_over_month_analysis(self, year, month):
+        """Analyzes the monthly_product_frame by product line and returns a dataframe with
+        product line change over month data."""
+        import information_repository as ir
+        #year = int(input('Type the year you would like to query in yyyy format:  '))
+        #month = int(input('Type the month you would like to query:  '))
+        product_line_list_of_lists = [ir.tea_product_list, ir.capsule_product_list, ir.smokeable_product_list,
+                             ir.skincare_product_list, ir.superfood_product_list, ir.honey_product_list,
+                             ir.tincture_product_list]
+        product_line_strings = ['Tea', 'Capsules', 'Smokeables', 'Skincare', 'Superfood', 'Honey', 'Tinctures']
+        product_line_append_list = []
+        line_index_counter = 0
+        for product_line in product_line_list_of_lists:
+            line_list = []
+            line_list.append(year)
+            line_list.append(month)
+            data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[
+                self.analysis_frame['year'] == year].loc[self.analysis_frame['product'].isin(product_line)]
+            if month > 1:
+                last_month_frame = self.analysis_frame.loc[self.analysis_frame['month'] == (month - 1)].loc[
+                    self.analysis_frame['year'] == year].loc[self.analysis_frame['product'].isin(product_line)]
+            else:
+                last_month_frame = self.analysis_frame.loc[self.analysis_frame['month'] == 12].loc[
+                    self.analysis_frame['year'] == (year - 1)].loc[self.analysis_frame['product'].isin(product_line)]
+            last_month_revenue = last_month_frame['revenue'].sum()
+            this_month_revenue = data_slice['revenue'].sum()
+            avg_change_over_month = (this_month_revenue / last_month_revenue) * 100
+            line_list.append(avg_change_over_month)
+            product_line = product_line_strings[line_index_counter]
+            line_index_counter += 1
+            line_list.append(product_line)
+            product_line_append_list.append(line_list)
+        product_line_analysis_frame = pd.DataFrame(data=product_line_append_list,
+                                                   columns=['year', 'month', 'avg_change_over_month',
+                                                            'product_line'])
+        product_line_analysis_frame.to_csv('product_line_csv_2021.csv')
+        return product_line_analysis_frame
+
+    def serve_time_span(self):
+        """Returns a list of tuples of unique (year, month) pairs in chronological order based on the
+         monthly_product_frame."""
+        return sorted(sorted(list(set([*zip(self.analysis_frame['year'],self.analysis_frame['month'])])),
+                            key=lambda x: x[1]), key=lambda x: x[0])
+
+    def product_line_change_over_month_graph(self):
+        """Using the product_line_change_over_month_analysis frame, it outputs a graph of the changes over time for
+        the top product lines."""
+        line_change_frame_data = []
+        for i in self.time_span:
+            month_frame = self.product_line_change_over_month_analysis(i[0], i[1])
+            change_list = month_frame['avg_change_over_month']
+            line_change_frame_data.append(change_list)
+        treated_line_change_frame_data = []
+        for i in range(len(line_change_frame_data)): #index of time period/segment
+            if i ==0:
+                treated_line_change_frame_data.append([self.time_span[i][0], self.time_span[i][1],
+                                                       0,0,0,0,0,0,0]) #insert base amounts for the first month
+            else: #function as intended
+                month_cumulative_change_list = []
+                month_cumulative_change_list.append(self.time_span[i][0])
+                month_cumulative_change_list.append(self.time_span[i][1])# append year and month
+                for x in range(len(line_change_frame_data[0])):
+                    prior_change_list = [i[x] for i in line_change_frame_data]
+                    product_cumulative_change = (100+treated_line_change_frame_data[i-1][x+2]) * ((prior_change_list[i]/100))-100
+                    #i-1 for previous time period and x+2 for offset due to year and month category
+                    month_cumulative_change_list.append(product_cumulative_change)
+                treated_line_change_frame_data.append(month_cumulative_change_list)
+        graph_frame = pd.DataFrame(data=treated_line_change_frame_data, columns=['Year', 'Month', 'Tea', 'Capsules', 'Smokeables','Skincare',
+                                                                           'Superfood', 'Honey', 'Tinctures'])
+        print(graph_frame.head(7))
+        x = [str(i) for i in graph_frame['Month']]
+        y1 = graph_frame['Tea']
+        y2 = graph_frame['Capsules']
+        y3 = graph_frame['Superfood']
+        y4 = graph_frame['Honey']
+        y5 = graph_frame['Smokeables']
+        graph = figure(x_range=x,title='Cumulative Percentage Change of Product Lines',x_axis_label='Month', y_axis_label='Percentage Change')
+        graph.line(x, y1, legend_label ='Tea', color='red', line_width=3)
+        graph.line(x, y2, legend_label ='Capsules', color='blue', line_width=3)
+        graph.line(x, y3, legend_label ='Superfood', color='orange', line_width=3)
+        graph.line(x, y4, legend_label ='Honey', color='yellow', line_width=3)
+        graph.line(x, y5, legend_label ='Smokeables', color='green', line_width=3)
+        output_file('product_line_change_over_month.html')
+        save(graph)
+        return show(graph)
+
+
+class InventoryPredictor:
+    """Inventory volume prediction using a product sales csv as the raw data."""
+    def __init__(self):
+        import information_repository as ir
+        self.unit_counts = self.sales_unit_count_dictionaries()
+        self.ingredients = self.ingredient_dictionary()
+        self.recipes = ir.unit_recipes
+
+        print('initiating')
+        pass
+
+    def sales_unit_count_dictionaries(self):
+        """Creates a set of dictionaries for each product and the cumulative quantity of units across all SKUs."""
+        import information_repository as ir
+        product_sales_frame = pd.read_csv('product_sales.csv')
+        product_sales_frame = product_sales_frame.where(pd.notnull(product_sales_frame), 'None')
+        product_unit_amounts = []
+        for i in ir.p_list:
+            product_dict = dict(name=i, quantity=0)
+            for x, row in product_sales_frame.iterrows():
+                if i in row['Product Name']:
+                    if i in ir.tea_product_list:
+                        if '1' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold']
+                        elif '3' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 3
+                        elif '20' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 20
+                        else:
+                            pass
+                            # print('Something unexpected occurred', row['Product Name'], row['Variation Attributes'])
+                    elif i in ir.superfood_product_list:
+                        if '3' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold']
+                        elif '9' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 3
+                        else:
+                            product_dict['quantity'] += 1
+                    elif i in ir.capsule_product_list:
+                        if '1' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold']
+                        if '4' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 4
+                    elif i in ir.smokeable_product_list:
+                        if '7' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 7
+                        elif 'prerolls' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 2
+                        else:
+                            product_dict['quantity'] += row['Quantity Sold'] * 4
+                    elif i in ir.honey_product_list:
+                        if '3' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 3
+                        elif '5' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 5
+                        elif '2' in row['Variation Attributes']:
+                            pass
+                            # print('Reminder that packet honeys and jars need to be separated')
+                    else:
+                        product_dict['quantity'] += row['Quantity Sold']
+            product_unit_amounts.append(product_dict)
+        return product_unit_amounts
+
+    def ingredient_dictionary(self):
+        """Creates a ingredient dictionary with all ingredients as keys and the cumulative volume across all
+        products as values."""
+        inventory = pd.read_csv('craftybase-export-material.csv')
+        ingredient_dictionary = {}
+        for i in list(inventory['name']):
+            ingredient_dictionary[i]=0
+        return ingredient_dictionary
+
+    def ingredient_volume_table(self):
+        """Creates a csv with ingredients and the cumulative volume used across a time span."""
+        for x in self.unit_counts:
+            for y in self.recipes:
+                if x['name'] == y['name']:
+                    for k, v in y.items():
+                        if k != 'name':
+                            self.ingredients[k] += v * x['quantity']
+        sorted_ingredient_volumes = sorted(self.ingredients.items(), key=lambda x: x[1], reverse=True)
+        output_frame = pd.DataFrame(data = sorted_ingredient_volumes, columns= ['Ingredient', 'Volume (gram or oz)'])
+        output_frame = output_frame[output_frame['Volume (gram or oz)'] !=0]
+        output_frame.to_csv('ingredient_volume_table.csv')
+

+ 338 - 0
BI/macro_analysis.py

@@ -0,0 +1,338 @@
+import pandas as pd
+from bokeh.plotting import figure, save, show,output_file, ColumnDataSource
+from bokeh.models import HoverTool
+import matplotlib.pyplot as plt
+
+class DataFrameAnalysis:
+    """Arms Macro-Analysis capability to a dataframe"""
+    def __init__(self, frame):
+        self.df = frame  # dataframe object
+
+    def avg_discount_rate(self):
+        """Calculates average discount rate of all orders."""
+        # You should calculate the average and gross discount rate.
+        self.df['Discount_Amount'] = pd.to_numeric(self.df['Discount_Amount'])
+        self.df['Order_Total_Amount'] = pd.to_numeric(self.df['Order_Total_Amount'])
+        total_sales_amount = self.df['Order_Total_Amount'].sum()
+        total_discount_amount = self.df['Discount_Amount'].sum()
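+        # Gross discount rate: the total discount as a share of (discount + order totals), i.e. of the
+        # pre-discount amount, assuming Order_Total_Amount holds the post-discount order total.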
+        total_discount_avg = int((total_discount_amount / (total_discount_amount+total_sales_amount))*100)
+        return print(f'Customer Discount Avg: {total_discount_avg}%')
+
+
+    def customer_role_breakdown(self):
+        """Calculates proportion of retail/wholesale as a function of sales."""
+        retail = 0
+        wholesale = 0
+        sum_count =int(len(self.df.index))
+        sum_sales = self.df['Order_Total_Amount'].sum()
+        retail_customer_count = round((len(self.df.loc[self.df['Customer_Role']=='Customer'].index)/sum_count)*100)
+        wholesale_customer_count = round((len(self.df.loc[self.df['Customer_Role']=='Wholesale Customer'].index)/sum_count)*100)
+        retail_sales = round((self.df['Order_Total_Amount'].loc[self.df['Customer_Role']=='Customer'].sum()/sum_sales)*100)
+        wholesale_sales = round((self.df['Order_Total_Amount'].loc[self.df['Customer_Role']=='Wholesale Customer'].sum()/sum_sales)*100)
+        grid = [[retail_customer_count,wholesale_customer_count],[retail_sales,wholesale_sales]]
+        crb_df = pd.DataFrame(data=grid, columns=['Retail','Wholesale'], index=['Proportional Order Counts', 'Proportional Sales'])
+        plt.style.use('seaborn-deep')
+        fig, ax = plt.subplots(figsize=(10, 10))
+        crb_df.plot.bar(title='Customer Role Breakdown', xlabel='Customer Role', ylabel='Proportion (%)',
+                        cmap='winter', ax=ax)
+        plt.savefig('Customer_Role_Breakdown.png')
+        print(crb_df.head(3))
+
+    def geographical_breakdown(self):
+        """ Displays a scatterplot of Sales/Revenue weights for different States."""
+        self.df = self.df[self.df.Country_Name_Shipping== 'United States (US)']
+        counts = self.df["State_Name_Shipping"].value_counts().to_dict()
+        States = list(counts.keys())
+        Count = list(counts.values())
+        geo = pd.DataFrame({'States': States, 'Counts': Count})
+        geo_dataframe = pd.DataFrame(geo)
+        geo_dataframe.insert(loc=2, column="Sales_Total", value=0)
+        geo_dataframe.insert(loc=3, column="Avg_Purchase_Revenue", value=0)
+        for i, row in self.df.iterrows():
+            state = row.loc['State_Name_Shipping']
+            total = row.loc['Order_Total_Amount']
+            idx = geo_dataframe[geo_dataframe["States"] == state].index.item()
+            geo_dataframe.at[idx, 'Sales_Total'] += total
+            # recompute the average after adding this order so the last order is included
+            av = int(geo_dataframe.at[idx, 'Sales_Total']) / int(geo_dataframe.at[idx, 'Counts'])
+            geo_dataframe.at[idx, 'Avg_Purchase_Revenue'] = av
+        # data visualization
+        cds = ColumnDataSource(geo_dataframe)
+        cds.data.keys()
+        visual = figure(tools='box_zoom, pan, reset',
+                        width=700, height=700,
+                        title='Geographical Sales Breakdown',
+                        y_axis_label='Order Quantity', x_axis_label='Revenue')
+        visual.circle('Sales_Total', 'Counts', size=7, source=cds, name= 'States')
+        visual.add_tools(HoverTool(tooltips=[("State", "@States"),
+                                             ("Average Purchase Revenue", "@Avg_Purchase_Revenue")
+                                             ]))
+        output_file('geographical_breakdown.html')
+        save(visual)
+        show(visual)
+        return print(geo_dataframe)
+
+
+class ProductAnalysis:
+    """Arms product analysis capability to a dataframe"""
+
+    def __init__(self, frame):
+        self.df = frame  # dataframe object
+        self.analysis_frame = self.monthly_product_frame()
+        self.time_span = self.serve_time_span()  # list of tuples: x[0] == year, x[1] == month for x in self.time_span
+
+    def monthly_product_frame(self):
+        """Analyzes the order lines in the CSV_Files folder and
+        Returns a pandas Dataframe with monthly product statistics."""
+        from datetime import datetime
+        import information_repository as ir
+        frame = self.df
+        frame = frame[['Order_Date', 'Product_Name', 'Quantity', 'Item_Cost']]
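+        # Assumption: each order row packs its line items as newline-separated strings in
+        # Product_Name / Quantity / Item_Cost, which are split in parallel below.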
+        dict_list = []
+        for i, row in frame.iterrows():
+            row_date = row['Order_Date']
+            row_date = datetime.strptime(row_date, "%Y-%m-%d %H:%M")
+            row_date_month = row_date.month
+            row_date_year = row_date.year
+            raw_products = row['Product_Name'].replace('\r', '').split('\n')
+            raw_quantities = row['Quantity'].replace('\r', '').split('\n')
+            raw_cost = row['Item_Cost'].replace('\r', '').split('\n')
+            for key in range(len(raw_products)):
+                product = [i for i in ir.p_list if i in raw_products[key]][0]
+                quantity = int(raw_quantities[key])
+                revenue = float(raw_cost[key])
+                dict_object = [product, quantity, revenue, row_date_month, row_date_year]
+                matched_dictionary = [i for i in dict_list if
+                                      i['name'] == dict_object[0] and i['month'] == dict_object[3]
+                                      and i['year'] == dict_object[4]]
+                if len(matched_dictionary) == 1:
+                    matched_dictionary[0]['count'] += dict_object[1]
+                    matched_dictionary[0]['revenue'] += dict_object[2]
+                else:
+                    dict_list.append({'name': dict_object[0], 'count': dict_object[1],
+                                      'revenue': dict_object[2], 'month': dict_object[3], 'year': dict_object[4]})
+        self.analysis_frame = pd.DataFrame(columns=['year', 'month', 'count', 'revenue', 'change_over_month', 'product'])
+        time_span = []
+        for product in ir.p_list:
+            product_dictionaries = sorted(
+                sorted([i for i in dict_list if i['name'] == product], key=lambda x: x['month']
+                       ), key=lambda x: x['year'])
+            data_list = []
+            year_list = []
+            month_list = []
+            for key in range(len(product_dictionaries)):
+                if key > 0:
+                    try:
+                        change_over_month = (100 - round(
+                            ((product_dictionaries[key]['revenue'] / product_dictionaries[key]['count'])
+                             / (product_dictionaries[key - 1]['revenue'] / product_dictionaries[key - 1][
+                                        'count'])) * 100))
+
+                    except IndexError:
+                        print('change_list calls need to be refined')
+                else:
+                    change_over_month = 0
+
+                row_list = [product_dictionaries[key]['year'], product_dictionaries[key]['month'],
+                            product_dictionaries[key]['count'], product_dictionaries[key]['revenue'], change_over_month,
+                            product_dictionaries[key]['name']]
+                data_list.append(row_list)
+                if product == 'Blue Moon':
+                    month_list.append(product_dictionaries[key]['month'])
+                    year_list.append(product_dictionaries[key]['year'])
+
+            if product == 'Blue Moon':
+                time_span = [*zip(year_list, month_list)]
+            append_frame = pd.DataFrame(data=data_list,
+                                        columns=['year', 'month', 'count', 'revenue', 'change_over_month', 'product'])
+            self.analysis_frame = pd.concat([self.analysis_frame, append_frame], ignore_index=True)
+        self.time_span = time_span
+        return self.analysis_frame
+
+    def highest_positive_product_change_over_month_analysis(self):
+        """Analyzes the monthly_product_frame and returns the 5 products whose sales level increased the most"""
+        year = int(input('Type the year you would like to query in yyyy format:  '))
+        month = int(input('Type the month you would like to query:  '))
+        data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[self.analysis_frame['year'] == year].loc[self.analysis_frame['revenue']>500]
+        data_slice.sort_values(by='change_over_month', inplace=True, ascending=False)
+        return print(data_slice.head(5))
+
+    def highest_negative_product_change_over_month_analysis(self):
+        """Analyzes the monthly_product_frame and returns the 5 products whose sales level decreased the most"""
+        year = int(input('Type the year you would like to query in yyyy format:  '))
+        month = int(input('Type the month you would like to query:  '))
+        data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[self.analysis_frame['year'] == year].loc[self.analysis_frame['revenue']>500]
+        data_slice.sort_values(by='change_over_month', inplace=True, ascending=True)
+        return data_slice
+
+    def product_line_change_over_month_analysis(self, year, month):
+        """Analyzes the monthly_product_frame by product line and returns a dataframe with
+        product line change over month data."""
+        import information_repository as ir
+        #year = int(input('Type the year you would like to query in yyyy format:  '))
+        #month = int(input('Type the month you would like to query:  '))
+        product_line_list_of_lists = [ir.tea_product_list, ir.capsule_product_list, ir.smokeable_product_list,
+                             ir.skincare_product_list, ir.superfood_product_list, ir.honey_product_list,
+                             ir.tincture_product_list]
+        product_line_strings = ['Tea', 'Capsules', 'Smokeables', 'Skincare', 'Superfood', 'Honey', 'Tinctures']
+        product_line_append_list = []
+        line_index_counter = 0
+        for product_line in product_line_list_of_lists:
+            line_list = []
+            line_list.append(year)
+            line_list.append(month)
+            data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[
+                self.analysis_frame['year'] == year].loc[self.analysis_frame['product'].isin(product_line)]
+            if month > 1:
+                last_month_frame = self.analysis_frame.loc[self.analysis_frame['month'] == (month - 1)].loc[
+                    self.analysis_frame['year'] == year].loc[self.analysis_frame['product'].isin(product_line)]
+            else:
+                last_month_frame = self.analysis_frame.loc[self.analysis_frame['month'] == 12].loc[
+                    self.analysis_frame['year'] == (year - 1)].loc[self.analysis_frame['product'].isin(product_line)]
+            last_month_revenue = last_month_frame['revenue'].sum()
+            this_month_revenue = data_slice['revenue'].sum()
+            avg_change_over_month = (this_month_revenue / last_month_revenue) * 100
+            line_list.append(avg_change_over_month)
+            product_line = product_line_strings[line_index_counter]
+            line_index_counter += 1
+            line_list.append(product_line)
+            product_line_append_list.append(line_list)
+        product_line_analysis_frame = pd.DataFrame(data=product_line_append_list,
+                                                   columns=['year', 'month', 'avg_change_over_month',
+                                                            'product_line'])
+        product_line_analysis_frame.to_csv('product_line_csv_2021.csv')
+        return product_line_analysis_frame
+
+    def serve_time_span(self):
+        """Returns a list of tuples of unique (year, month) pairs in chronological order based on the
+         monthly_product_frame."""
+        return sorted(sorted(list(set([*zip(self.analysis_frame['year'],self.analysis_frame['month'])])),
+                            key=lambda x:x[1]), key=lambda x:x[0])
+
+    def product_line_change_over_month_graph(self):
+        """Using the product_line_change_over_month_analysis frame, it outputs a graph of the changes over time for
+        the top product lines."""
+        line_change_frame_data = []
+        for i in self.time_span:
+            month_frame = self.product_line_change_over_month_analysis(i[0], i[1])
+            change_list = month_frame['avg_change_over_month']
+            line_change_frame_data.append(change_list)
+        treated_line_change_frame_data = []
+        for i in range(len(line_change_frame_data)): #index of time period/segment
+            if i ==0:
+                treated_line_change_frame_data.append([self.time_span[i][0], self.time_span[i][1],
+                                                       0,0,0,0,0,0,0]) #insert base amounts for the first month
+            else: #function as intended
+                month_cumulative_change_list = []
+                month_cumulative_change_list.append(self.time_span[i][0])
+                month_cumulative_change_list.append(self.time_span[i][1])# append year and month
+                for x in range(len(line_change_frame_data[0])):
+                    prior_change_list = [i[x] for i in line_change_frame_data]
+                    product_cumulative_change = (100+treated_line_change_frame_data[i-1][x+2]) * ((prior_change_list[i]/100))-100
+                    #i-1 for previous time period and x+2 for offset due to year and month category
+                    month_cumulative_change_list.append(product_cumulative_change)
+                treated_line_change_frame_data.append(month_cumulative_change_list)
+        graph_frame = pd.DataFrame(data=treated_line_change_frame_data, columns=['Year', 'Month', 'Tea', 'Capsules', 'Smokeables','Skincare',
+                                                                           'Superfood', 'Honey', 'Tinctures'])
+        print(graph_frame.head(7))
+        x = [str(i) for i in graph_frame['Month']]
+        y1 = graph_frame['Tea']
+        y2 = graph_frame['Capsules']
+        y3 = graph_frame['Superfood']
+        y4 = graph_frame['Honey']
+        y5 = graph_frame['Smokeables']
+        graph = figure(x_range=x,title='Cumulative Percentage Change of Product Lines',x_axis_label='Month', y_axis_label='Percentage Change')
+        graph.line(x, y1, legend_label ='Tea', color='red', line_width=3)
+        graph.line(x, y2, legend_label ='Capsules', color='blue', line_width=3)
+        graph.line(x, y3, legend_label ='Superfood', color='orange', line_width=3)
+        graph.line(x, y4, legend_label ='Honey', color='yellow', line_width=3)
+        graph.line(x, y5, legend_label ='Smokeables', color='green', line_width=3)
+        output_file('product_line_change_over_month.html')
+        save(graph)
+        return show(graph)
+
+
+class InventoryPredictor:
+    """Inventory volume prediction using a product sales csv as the raw data."""
+    def __init__(self):
+        import information_repository as ir
+        self.unit_counts = self.sales_unit_count_dictionaries()
+        self.ingredients = self.ingredient_dictionary()
+        self.recipes = ir.unit_recipes
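+        # Assumed shape of ir.unit_recipes (defined in information_repository): a list of dicts,
+        # each with a 'name' key plus ingredient -> quantity-per-unit entries, as consumed
+        # by ingredient_volume_table() below.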
+
+        print('initiating')
+        pass
+
+    def sales_unit_count_dictionaries(self):
+        """Creates a set of dictionaries for each product and the cumulative quantity of units across all SKUs."""
+        import information_repository as ir
+        product_sales_frame = pd.read_csv('product_sales.csv')
+        product_sales_frame = product_sales_frame.where(pd.notnull(product_sales_frame), 'None')
+        product_unit_amounts = []
+        for i in ir.p_list:
+            product_dict = dict(name=i, quantity=0)
+            for x, row in product_sales_frame.iterrows():
+                if i in row['Product Name']:
+                    if i in ir.tea_product_list:
+                        if '1' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold']
+                        elif '3' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 3
+                        elif '20' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 20
+                        else:
+                            pass
+                            # print('Something unexpected occurred', row['Product Name'], row['Variation Attributes'])
+                    elif i in ir.superfood_product_list:
+                        if '3' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold']
+                        elif '9' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 3
+                        else:
+                            product_dict['quantity'] += 1
+                    elif i in ir.capsule_product_list:
+                        if '1' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold']
+                        if '4' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 4
+                    elif i in ir.smokeable_product_list:
+                        if '7' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 7
+                        elif 'prerolls' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 2
+                        else:
+                            product_dict['quantity'] += row['Quantity Sold'] * 4
+                    elif i in ir.honey_product_list:
+                        if '3' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 3
+                        elif '5' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 5
+                        elif '2' in row['Variation Attributes']:
+                            pass
+                            # print('Reminder that packet honeys and jars need to be separated')
+                    else:
+                        product_dict['quantity'] += row['Quantity Sold']
+            product_unit_amounts.append(product_dict)
+        return product_unit_amounts
+
+    def ingredient_dictionary(self):
+        """Creates a ingredient dictionary with all ingredients as keys and the cumulative volume across all
+        products as values."""
+        inventory = pd.read_csv('craftybase-export-material.csv')
+        return {name: 0 for name in inventory['name']}
+
+    def ingredient_volume_table(self):
+        """Creates a csv with ingredients and the cumulative volume used across a time span."""
+        for x in self.unit_counts:
+            for y in self.recipes:
+                if x['name'] == y['name']:
+                    for k, v in y.items():
+                        if k != 'name':
+                            self.ingredients[k] += v * x['quantity']
+        sorted_ingredient_volumes = sorted(self.ingredients.items(), key=lambda x: x[1], reverse=True)
+        output_frame = pd.DataFrame(data=sorted_ingredient_volumes, columns=['Ingredient', 'Volume (gram or oz)'])
+        output_frame = output_frame[output_frame['Volume (gram or oz)'] != 0]
+        output_frame.to_csv('ingredient_volume_table.csv')
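+
+# Minimal usage sketch (assumes product_sales.csv and craftybase-export-material.csv
+# sit in the working directory alongside information_repository.py):
+#   predictor = InventoryPredictor()
+#   predictor.ingredient_volume_table()  # writes ingredient_volume_table.csv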
+

+ 662 - 0
BI/practica3.py

@@ -0,0 +1,662 @@
+# -*- coding: utf-8 -*-
+"""
+Autor:
+    Francisco Solano López Rodríguez
+Fecha:
+    Noviembre/2018
+Contenido:
+    Práctica 3
+    Inteligencia de Negocio
+    Grado en Ingeniería Informática
+    Universidad de Granada
+"""
+
+''' -------------------- IMPORT LIBRARY -------------------- '''
+
+import pandas as pd
+import numpy as np
+import time
+import matplotlib.pyplot as plt
+import seaborn as sns
+from collections import Counter
+
+import datetime
+
+from sklearn.model_selection import StratifiedKFold, KFold
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.feature_selection import VarianceThreshold
+from sklearn import ensemble
+
+''' --- classifiers import --- '''
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import ExtraTreesClassifier
+from sklearn.neural_network import MLPClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn import svm
+import xgboost as xgb
+import lightgbm as lgb
+from sklearn import tree
+
+from sklearn.svm import SVC, LinearSVC, NuSVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+
+from catboost import Pool, CatBoostClassifier
+
+''' --- preprocessing import --- '''
+from sklearn import preprocessing
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing import LabelEncoder
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import StandardScaler  
+from sklearn.preprocessing import PolynomialFeatures
+from sklearn.preprocessing import scale
+from sklearn.preprocessing import Normalizer
+
+''' --- metrics import --- '''
+from sklearn import metrics
+from sklearn.metrics import roc_curve, auc
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import accuracy_score
+
+from math import sin, cos, sqrt, atan2, radians
+
+
+# Extract features from the recording date and compute the well's age
+def date_parser(df):
+    date_recorder = list(map(lambda x: datetime.datetime.strptime(str(x), '%Y-%m-%d'),
+                             df['date_recorded'].values))
+    df['year_recorder'] = list(map(lambda x: int(x.strftime('%Y')), date_recorder))
+    df['weekday_recorder'] = list(map(lambda x: int(x.strftime('%w')), date_recorder))
+    df['yearly_week_recorder'] = list(map(lambda x: int(x.strftime('%W')), date_recorder))
+    df['month_recorder'] = list(map(lambda x: int(x.strftime('%m')), date_recorder))
+    df['age'] = df['year_recorder'].values - df['construction_year'].values
+    del df['date_recorded']
+    return df
+
+
+# Haversine-style distance to the coordinate (0, 0); the inputs are used as given
+# (no degree-to-radian conversion here), so the result serves only as a relative feature
+def distancia(lon1, lat1, lon2, lat2):  
+    dlon = lon2 - lon1
+    dlat = lat2 - lat1
+
+    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
+    c = 2 * atan2(sqrt(a), sqrt(1 - a))
+    R = 6371
+
+    return R * c
+
+# Compute the Cartesian x coordinate from the longitude and latitude
+def cartesian_x(lon, lat):
+    lat=radians(lat)
+    lon=radians(lon)
+    R=6371.0
+    x = R * cos(lat) * cos(lon)
+    return x
+
+# Compute the Cartesian y coordinate from the longitude and latitude
+def cartesian_y(lon, lat):
+    lat=radians(lat)
+    lon=radians(lon)
+    R=6371.0
+    y = R * cos(lat) * sin(lon)
+    return y
+
+# Confusion matrix
+def plot_confusion_matrix(y_test, predictions):
+    cm = metrics.confusion_matrix(y_test, predictions)
+    plt.figure(figsize=(9,9))
+    sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True)
+    plt.ylabel('Actual label')
+    plt.xlabel('Predicted label')
+    plt.show()
+
+# Function to run the cross-validation
+def cross_validation(clf, X, y, cv = None, min_max_scaler = False, scaled = False, standard_scaler = False, normalizer = False, poly = False, m_confusion = False):
+
+    if cv is None:
+        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123456)
+
+    iteration = 0
+
+    for train, test in cv.split(X, y):
+
+        X_train, X_test = X[train], X[test]
+        y_train, y_test = y[train], y[test]
+
+
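+        # Note: this branch fits a fresh MinMaxScaler on each split independently;
+        # the standard_scaler and normalizer branches below reuse the transformer
+        # fitted on the training fold.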
+        if min_max_scaler:
+            X_train = MinMaxScaler().fit_transform(X_train)
+            X_test = MinMaxScaler().fit_transform(X_test)
+
+        if scaled:
+            X_train = scale(X_train)
+            X_test = scale(X_test)
+
+        if poly:
+            X_train = PolynomialFeatures(degree = 2, interaction_only=True).fit_transform(X_train)
+            X_test = PolynomialFeatures(degree = 2, interaction_only=True).fit_transform(X_test)
+
+        if standard_scaler:
+            transformer = StandardScaler().fit(X_train)
+            X_train = transformer.transform(X_train)
+            X_test = transformer.transform(X_test)
+
+        if normalizer:
+            transformer = Normalizer().fit(X_train)
+            X_train = transformer.transform(X_train)
+            X_test = transformer.transform(X_test)
+
+        t = time.time()
+        clf = clf.fit(X_train,y_train)
+        training_time = time.time() - t
+
+        predictions_train = clf.predict(X_train)
+        predictions = clf.predict(X_test)
+
+        print("--------- Iteración ", iteration, " --------- ")
+        print("Tiempo :: ", training_time)
+        print ("Train Accuracy :: ", accuracy_score(y_train, predictions_train))
+        print ("Test Accuracy  :: ", accuracy_score(y_test, predictions))
+        print("")
+
+        if m_confusion:
+            plot_confusion_matrix(y_test, predictions)
+
+        iteration += 1
+
+''' ------------------------------------------------------------------ '''
+''' --------------------------- READ DATA ---------------------------- '''
+''' ------------------------------------------------------------------ '''
+
+print("\nWATER PUMP COMPETITION\n")
+
+print("Leyendo datos...")
+
+# The .csv files were prepared beforehand, replacing ',,' and "Not known" with NaN (missing values)
+data_x_orig = pd.read_csv('data/water_pump_tra.csv')
+data_y = pd.read_csv('data/water_pump_tra_target.csv')
+data_x_tst = pd.read_csv('data/water_pump_tst.csv')
+
+print(data_x_orig.shape)
+print(data_x_tst.shape)
+
+print("Lectura completada.\n")
+
+
+''' ------------------------------------------------------------------ '''
+''' -------------------------- LOOK AT DATA -------------------------- '''
+''' ------------------------------------------------------------------ '''
+
+print('Inspecting the data:\n')
+
+data_x = data_x_orig
+
+print('num_private:')
+print(data_x['num_private'].value_counts()[0:3])
+print('recorded_by:')
+print(data_x['recorded_by'].value_counts())
+print(data_y.status_group.value_counts()/len(data_y))
+
+data_y.status_group.value_counts().plot(kind='bar')
+plt.xticks(rotation = 0)
+plt.show()
+
+print('Examples with longitude = 0')
+print(len(data_x.loc[data_x['longitude']==0, 'longitude']))
+
+print('Examples with latitude = 0')
+print(len(data_x.loc[data_x['latitude']==-0.00000002, 'latitude']))
+
+print('Examples with construction_year = 0')
+print(len(data_x.loc[data_x['construction_year']==0, 'construction_year']))
+
+
+corr = data_x.corr()
+sns.heatmap(corr)
+plt.xticks(rotation=45)
+plt.show()
+
+print("Valores perdidos:")
+print(data_x.isnull().sum())
+
+data_x.isnull().sum().plot.bar()
+plt.show()
+
+print('funder:\n')
+print(data_x['funder'].value_counts()[0:6])
+print('\ninstaller:\n')
+print(data_x['installer'].value_counts()[0:6])
+print('\npublic_meeting:\n')
+print(data_x['public_meeting'].value_counts()[0:6])
+print('\nscheme_management:\n')
+print(data_x['scheme_management'].value_counts()[0:6])
+print('\npermit:\n')
+print(data_x['permit'].value_counts()[0:6])
+print('\nsubvillage:\n')
+print(data_x['subvillage'].value_counts()[0:6])
+print('\nwpt_name:\n')
+print(data_x['wpt_name'].value_counts()[0:6])
+
+'''
+data_x['funder'].value_counts()[0:10].plot.bar()
+plt.show()
+data_x['installer'].value_counts().plot.bar()
+plt.show()
+data_x['public_meeting'].value_counts().plot.bar()
+plt.show()
+data_x['scheme_management'].value_counts().plot.bar()
+plt.show()
+data_x['permit'].value_counts().plot.bar()
+plt.show()
+data_x['subvillage'].value_counts().plot.bar()
+plt.show()
+data_x['wpt_name'].value_counts().plot.bar()
+plt.show()
+'''
+
+''' ------------------------------------------------------------------ '''
+''' ------------------------- PREPROCESSING -------------------------- '''
+''' ------------------------------------------------------------------ '''
+
+print("\nPreprocesando datos...")
+
+data_x = pd.concat([data_x_orig, data_x_tst])
+
+
+''' ------------------ DROP COLUMNS ------------------ '''
+
+print("  Borrando columnas...")
+columns_to_drop = ['id', 'num_private', 'recorded_by', 'scheme_name']
+data_x.drop(labels=columns_to_drop, axis=1, inplace = True)
+data_y.drop(labels=['id'], axis=1,inplace = True)
+
+
+
+''' ------------------ MISSING VALUES ------------------ '''
+
+print("  Modificando valores nan...")
+data_x['funder'] = data_x['funder'].fillna('Government Of Tanzania')
+data_x['installer'] = data_x['installer'].fillna('DWE')
+data_x['public_meeting'] = data_x['public_meeting'].fillna(True)
+data_x['scheme_management'] = data_x['scheme_management'].fillna('VWC')
+data_x['permit'] = data_x['permit'].fillna(True)
+data_x['subvillage'] = data_x['subvillage'].fillna('Unknown')
+data_x['wpt_name'] = data_x['wpt_name'].fillna('none')
+
+data_x.loc[data_x['latitude']>-0.1, 'latitude'] = None
+data_x.loc[data_x['longitude']==0, 'longitude'] = None
+data_x["longitude"] = data_x.groupby("region_code").transform(lambda x: x.fillna(x.median())).longitude
+data_x["latitude"] = data_x.groupby("region_code").transform(lambda x: x.fillna(x.median())).latitude
+
+data_x.construction_year=pd.to_numeric(data_x.construction_year)
+data_x.loc[data_x.construction_year <= 0, 'construction_year'] = 1950
+
+# mean() is much slower, but improves the results slightly compared to median()
+#data_x=data_x.fillna(data_x.mean())
+#data_x = data_x.fillna(data_x.median())
+
+''' ------------------ RARE VALUES ------------------ '''
+
+print("  Etiquetando casos raros...")
+columns_other = [x for x in data_x.columns if x not in ['latitude','longitude','gps_height','age','population','construction_year','month_recorder']]
+
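+# Relabel category values that occur fewer than 20 times in a column as 'Others'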
+for col in columns_other:
+    value_counts = data_x[col].value_counts()
+    lessthen = value_counts[value_counts < 20]
+    listnow = data_x[col].isin(list(lessthen.keys()))
+    data_x.loc[listnow,col] = 'Others'
+
+
+''' ------------------ CARTESIAN ------------------ '''
+
+print("  Preprocesando coordenadas y distancias...")
+data_x['dist'] = data_x.apply(lambda row: distancia(row['longitude'], row['latitude'], 0, 0), axis=1)
+data_x['cartesian_x'] = data_x.apply(lambda row: cartesian_x(row['longitude'], row['latitude']), axis=1)
+data_x['cartesian_y'] = data_x.apply(lambda row: cartesian_y(row['longitude'], row['latitude']), axis=1)
+data_x.drop(labels=['longitude', 'latitude'], axis=1, inplace = True)
+
+''' ------------------ DATES ------------------ '''
+
+print("  Preprocesando fechas...")
+data_x = date_parser(data_x)
+
+
+
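+# Compress the heavy right tail of the population column with a log10 transform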
+data_x.population = data_x.population.apply(lambda x: np.log10(x+1))
+
+print("  Convirtiendo categóricas a numéricas...")
+data_x = data_x.astype(str).apply(LabelEncoder().fit_transform)
+
+data_x_tst = data_x[len(data_x_orig):]
+data_x = data_x[:len(data_x_orig)]
+
+X = data_x.values
+y = np.ravel(data_y.values)
+#y = le.fit(y).transform(y)
+X_tst = data_x_tst.values
+
+print("Datos preprocesados con éxito.\n")
+
+
+''' -------------------- CROSS VALIDATION -------------------- '''
+
+'''
+print("Validación cruzada:\n")
+
+print('\nKNN\n')
+knn = KNeighborsClassifier(n_neighbors=5)
+cross_validation(clf=knn, X = X, y = y, cv = None, min_max_scaler = True)
+
+print('\nXGB\n')
+clf = xgb.XGBClassifier(n_estimators = 200)
+cross_validation(clf, X, y)
+
+print('\nLGB\n')
+clf = lgb.LGBMClassifier(objective='binary', n_estimators=200, num_leaves=31)
+cross_validation(clf, X, y)
+
+print('\nRandomForest\n')
+clf = RandomForestClassifier(n_estimators=125, max_depth = 20, random_state = 10)
+cross_validation(clf, X, y)
+
+print('\nExtraTreesClassifier\n')
+clf = ExtraTreesClassifier(n_estimators = 125, max_depth = 20)
+cross_validation(clf, X, y)
+'''
+
+''' -------------------- SUBMISSION 1 -------------------- '''
+'''
+clf = xgb.XGBClassifier(n_estimators = 200)
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission1.csv", index=False)
+'''
+''' ---------------------------------------------------- '''
+
+''' -------------------- SUBMISSION 2 -------------------- '''
+'''
+clf = RandomForestClassifier(n_estimators = 125)
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission2.csv", index=False)
+'''
+''' ---------------------------------------------------- '''
+
+''' -------------------- SUBMISSION 3 -------------------- '''
+'''
+clf = RandomForestClassifier()
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission3.csv", index=False)
+'''
+''' ---------------------------------------------------- '''
+
+
+''' -------------------- SUBMISSION 6 -------------------- '''
+'''
+# Eliminated features:
+# 'num_private', 'recorded_by', 'region', 'scheme_name', 'scheme_management'
+
+clf = RandomForestClassifier(max_features = 'sqrt', n_estimators = 500, random_state=10)
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission6.csv", index=False)
+'''
+''' ---------------------------------------------------- '''
+
+
+''' -------------------- SUBMISSION 8 -------------------- '''
+'''
+print("Submission 8")
+
+clf = RandomForestClassifier(max_features = 'sqrt', n_estimators = 200, max_depth = 20)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission9.csv", index=False)
+'''
+''' ---------------------------------------------------- '''
+
+''' -------------------- SUBMISSION 11 -------------------- '''
+'''
+print("Submission 11")
+
+clf = RandomForestClassifier(n_estimators=200, max_depth = 20, random_state = 10)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission11.csv", index=False)
+'''
+
+''' -------------------- SUBMISSION 12 -------------------- '''
+'''
+print("Submission 12")
+
+clf = RandomForestClassifier(n_estimators=125, max_depth = 20)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission12.csv", index=False)
+'''
+
+''' -------------------- SUBMISSION 13 -------------------- '''
+'''
+print("Submission 13")
+
+fit_rf = RandomForestClassifier(max_features = 'sqrt', max_depth=20)
+estimators = range(25,201,25)
+param_dist = {'n_estimators': estimators}
+
+clf= GridSearchCV(fit_rf, cv = 5, scoring = 'accuracy', param_grid=param_dist, n_jobs = 3)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission13.csv", index=False)
+'''
+
+''' -------------------- SUBMISSION 15 -------------------- '''
+'''
+print("Submission 15")
+
+clf = RandomForestClassifier(n_estimators=125, max_depth = 22)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission15.csv", index=False)
+'''
+''' -------------------- SUBMISSION 16 -------------------- '''
+'''
+print("Submission 16")
+
+clf = RandomForestClassifier(n_estimators=500)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission16.csv", index=False)
+
+# Note: this experiment worsens the results, possibly due to overfitting
+'''
+
+''' -------------------- SUBMISSION 17 -------------------- '''
+'''
+print("Submission 17")
+
+clf = RandomForestClassifier(n_estimators=120, max_depth = 20)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission17.csv", index=False)
+
+'''
+
+''' -------------------- SUBMISSION 18 -------------------- '''
+'''
+# fillna() with the most frequent value
+print("Submission 18")
+
+clf = RandomForestClassifier(n_estimators=160, max_depth = 20)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission18.csv", index=False)
+'''
+
+''' -------------------- SUBMISSION 19 -------------------- '''
+'''
+# fillna() with the most frequent value
+print("Submission 19")
+
+clf = RandomForestClassifier(n_estimators=150, max_depth = 20)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission19.csv", index=False)
+'''
+
+''' -------------------- SUBMISSION 22 -------------------- '''
+'''
+print("Submission 22")
+
+fit_rf = RandomForestClassifier(max_features = 'sqrt', max_depth=20)
+estimators = range(25,201,25)
+param_dist = {'n_estimators': estimators}
+
+clf= GridSearchCV(fit_rf, cv = 5, scoring = 'accuracy', param_grid=param_dist, n_jobs = 3)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission22.csv", index=False)
+
+best_param = clf.best_params_['n_estimators']
+print ("Mejor valor para n_estimators: ", best_param)
+'''
+''' -------------------- SUBMISSION 23 -------------------- '''
+'''
+print("Submission 23")
+
+fit_rf = RandomForestClassifier(max_features = 'sqrt', max_depth=25)
+estimators = range(100,1101,25)
+param_dist = {'n_estimators': estimators}
+
+clf= GridSearchCV(fit_rf, cv = 5, scoring = 'accuracy', param_grid=param_dist, n_jobs = 3)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission23.csv", index=False)
+
+best_param = clf.best_params_['n_estimators']
+print ("Mejor valor para n_estimators: ", best_param)
+'''
+
+
+''' -------------------- SUBMISSION 24 -------------------- '''
+'''
+print("Submission 24")
+
+clf = RandomForestClassifier(n_estimators=100, max_depth = 20)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission24.csv", index=False)
+
+'''
+''' -------------------- SUBMISSION 25 -------------------- '''
+'''
+print("Submission 25")
+
+clf = RandomForestClassifier(n_estimators=150, max_depth = 20)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission25.csv", index=False)
+'''
+
+
+''' ------------------- FINAL SUBMISSION ------------------ '''
+
+''' -------------------- SUBMISSION 26 -------------------- '''
+
+print("Submission 26")
+
+clf = RandomForestClassifier(n_estimators = 125, max_depth = 20)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission26.csv", index=False)

+ 98 - 0
Directory/IOTA2Directory.py

@@ -0,0 +1,98 @@
+#!/usr/bin/python
+#-*- coding: utf-8 -*-
+
+# =========================================================================
+#   Program:   iota2
+#
+#   Copyright (c) CESBIO. All rights reserved.
+#
+#   See LICENSE for details.
+#
+#   This software is distributed WITHOUT ANY WARRANTY; without even
+#   the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+#   PURPOSE.  See the above copyright notices for more information.
+#
+# =========================================================================
+
+
+import os
+import shutil
+from Common import ServiceConfigFile as SCF
+
+
+def GenerateDirectories(cfg):
+    """
+    generate IOTA2 output directories
+    """
+    if not isinstance(cfg, SCF.serviceConfigFile):
+        cfg = SCF.serviceConfigFile(cfg)
+
+    root = cfg.getParam('chain', 'outputPath')
+    rm_PathTEST = cfg.getParam("chain", "remove_outputPath")
+    start_step = cfg.getParam("chain", "firstStep")
+
+    if os.path.exists(root) and root != "/" and rm_PathTEST and start_step == "init":
+        shutil.rmtree(root,ignore_errors=False)
+    os.mkdir(root)
+    if os.path.exists(root+"/logs"):
+        shutil.rmtree(root+"/logs")
+    os.mkdir(root+"/logs")
+    if os.path.exists(root+"/samplesSelection"):
+        shutil.rmtree(root+"/samplesSelection")
+    os.mkdir(root+"/samplesSelection")
+    if os.path.exists(root+"/model"):
+        shutil.rmtree(root+"/model")
+    os.mkdir(root+"/model")
+    if os.path.exists(root+"/formattingVectors"):
+        shutil.rmtree(root+"/formattingVectors")
+    os.mkdir(root+"/formattingVectors")
+    if os.path.exists(root+"/config_model"):
+        shutil.rmtree(root+"/config_model")
+    os.mkdir(root+"/config_model")
+    if os.path.exists(root+"/envelope"):
+        shutil.rmtree(root+"/envelope")
+    os.mkdir(root+"/envelope")
+    if os.path.exists(root+"/classif"):
+        shutil.rmtree(root+"/classif")
+    os.mkdir(root+"/classif")
+    if os.path.exists(root+"/shapeRegion"):
+        shutil.rmtree(root+"/shapeRegion")
+    os.mkdir(root+"/shapeRegion")
+    if os.path.exists(root+"/final"):
+        shutil.rmtree(root+"/final")
+    os.mkdir(root+"/final")
+    os.mkdir(root+"/final/simplification")
+    os.mkdir(root+"/final/simplification/tiles")
+    os.mkdir(root+"/final/simplification/vectors")    
+    os.mkdir(root+"/final/simplification/tmp")
+    if os.path.exists(root+"/features"):
+        shutil.rmtree(root+"/features")
+    os.mkdir(root+"/features")
+    if os.path.exists(root+"/dataRegion"):
+        shutil.rmtree(root+"/dataRegion")
+    os.mkdir(root+"/dataRegion")
+    if os.path.exists(root+"/learningSamples"):
+        shutil.rmtree(root+"/learningSamples")
+    os.mkdir(root+"/learningSamples")
+    if os.path.exists(root+"/dataAppVal"):
+        shutil.rmtree(root+"/dataAppVal")
+    os.mkdir(root+"/dataAppVal")
+    if os.path.exists(root+"/stats"):
+        shutil.rmtree(root+"/stats")
+    os.mkdir(root+"/stats")
+    
+    if os.path.exists(root+"/cmd"):
+        shutil.rmtree(root+"/cmd")
+    os.mkdir(root+"/cmd")
+    os.mkdir(root+"/cmd/stats")
+    os.mkdir(root+"/cmd/train")
+    os.mkdir(root+"/cmd/cla")
+    os.mkdir(root+"/cmd/confusion")
+    os.mkdir(root+"/cmd/features")
+    os.mkdir(root+"/cmd/fusion")
+    os.mkdir(root+"/cmd/splitShape")
+
+    merge_final_classifications = cfg.getParam('chain', 'merge_final_classifications')
+    if merge_final_classifications:
+        if os.path.exists(root+"/final/merge_final_classifications"):
+            shutil.rmtree(root+"/final/merge_final_classifications")

+ 31 - 0
Directory/advance_touch.py

@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Libraries
+import os
+import click
+
+@click.command()
+@click.argument('paths', nargs=-1)
+@click.option('-cd', '--change', 'cd', is_flag=True, default=False, help='After creating the directories, change to the new deeper directory.')
+def advance_touch(paths, cd):
+    """ Make folders and files """
+    for path in paths:
+        # Make folders
+        new_dirs = '/'.join(path.split('/')[0:-1])
+        if not os.path.exists(new_dirs) and new_dirs != '':
+            os.makedirs(new_dirs)
+        # Change directory
+        if cd:
+            cd_path = os.path.join(os.getcwd(), new_dirs) + '/'
+            os.chdir(cd_path)
+
+        # Make file
+        if not path.endswith('/') and not os.path.isfile(path):
+            try:
+                open(path, 'w+').close()
+            except IsADirectoryError:
+                pass
+
+if __name__ == '__main__':
+    advance_touch()
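+
+# Example (illustrative paths):
+#   python advance_touch.py src/utils/helpers.py notes/
+# creates the directories src/utils/ and notes/, plus an empty file src/utils/helpers.py.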

+ 213 - 0
Directory/augmentation_main.py

@@ -0,0 +1,213 @@
+from __future__ import print_function, unicode_literals
+import os
+from twisted.python import filepath
+from twisted.trial import unittest
+from .. import database
+from ..database import (CHANNELDB_TARGET_VERSION, USAGEDB_TARGET_VERSION,
+                        _get_db, dump_db, DBError)
+
+class Get(unittest.TestCase):
+    def test_create_default(self):
+        db_url = ":memory:"
+        db = _get_db(db_url, "channel", CHANNELDB_TARGET_VERSION)
+        rows = db.execute("SELECT * FROM version").fetchall()
+        self.assertEqual(len(rows), 1)
+        self.assertEqual(rows[0]["version"], CHANNELDB_TARGET_VERSION)
+
+    def test_open_existing_file(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "normal.db")
+        db = _get_db(fn, "channel", CHANNELDB_TARGET_VERSION)
+        rows = db.execute("SELECT * FROM version").fetchall()
+        self.assertEqual(len(rows), 1)
+        self.assertEqual(rows[0]["version"], CHANNELDB_TARGET_VERSION)
+        db2 = _get_db(fn, "channel", CHANNELDB_TARGET_VERSION)
+        rows = db2.execute("SELECT * FROM version").fetchall()
+        self.assertEqual(len(rows), 1)
+        self.assertEqual(rows[0]["version"], CHANNELDB_TARGET_VERSION)
+
+    def test_open_bad_version(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "old.db")
+        db = _get_db(fn, "channel", CHANNELDB_TARGET_VERSION)
+        db.execute("UPDATE version SET version=999")
+        db.commit()
+
+        with self.assertRaises(DBError) as e:
+            _get_db(fn, "channel", CHANNELDB_TARGET_VERSION)
+        self.assertIn("Unable to handle db version 999", str(e.exception))
+
+    def test_open_corrupt(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "corrupt.db")
+        with open(fn, "wb") as f:
+            f.write(b"I am not a database")
+        with self.assertRaises(DBError) as e:
+            _get_db(fn, "channel", CHANNELDB_TARGET_VERSION)
+        self.assertIn("not a database", str(e.exception))
+
+    def test_failed_create_allows_subsequent_create(self):
+        patch = self.patch(database, "get_schema", lambda version: b"this is a broken schema")
+        dbfile = filepath.FilePath(self.mktemp())
+        self.assertRaises(Exception, lambda: _get_db(dbfile.path))
+        patch.restore()
+        _get_db(dbfile.path, "channel", CHANNELDB_TARGET_VERSION)
+
+    def test_upgrade(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "upgrade.db")
+        self.assertNotEqual(USAGEDB_TARGET_VERSION, 1)
+
+        # create an old-version DB in a file
+        db = _get_db(fn, "usage", 1)
+        rows = db.execute("SELECT * FROM version").fetchall()
+        self.assertEqual(len(rows), 1)
+        self.assertEqual(rows[0]["version"], 1)
+        del db
+
+        # then upgrade the file to the latest version
+        dbA = _get_db(fn, "usage", USAGEDB_TARGET_VERSION)
+        rows = dbA.execute("SELECT * FROM version").fetchall()
+        self.assertEqual(len(rows), 1)
+        self.assertEqual(rows[0]["version"], USAGEDB_TARGET_VERSION)
+        dbA_text = dump_db(dbA)
+        del dbA
+
+        # make sure the upgrades got committed to disk
+        dbB = _get_db(fn, "usage", USAGEDB_TARGET_VERSION)
+        dbB_text = dump_db(dbB)
+        del dbB
+        self.assertEqual(dbA_text, dbB_text)
+
+        # The upgraded schema should be equivalent to that of a new DB.
+        latest_db = _get_db(":memory:", "usage", USAGEDB_TARGET_VERSION)
+        latest_text = dump_db(latest_db)
+        with open("up.sql","w") as f: f.write(dbA_text)
+        with open("new.sql","w") as f: f.write(latest_text)
+        # debug with "diff -u _trial_temp/up.sql _trial_temp/new.sql"
+        self.assertEqual(dbA_text, latest_text)
+
+    def test_upgrade_fails(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "upgrade.db")
+        self.assertNotEqual(USAGEDB_TARGET_VERSION, 1)
+
+        # create an old-version DB in a file
+        db = _get_db(fn, "usage", 1)
+        rows = db.execute("SELECT * FROM version").fetchall()
+        self.assertEqual(len(rows), 1)
+        self.assertEqual(rows[0]["version"], 1)
+        del db
+
+        # then upgrade the file to a too-new version, for which we have no
+        # upgrader
+        with self.assertRaises(DBError):
+            _get_db(fn, "usage", USAGEDB_TARGET_VERSION+1)
+
+class CreateChannel(unittest.TestCase):
+    def test_memory(self):
+        db = database.create_channel_db(":memory:")
+        latest_text = dump_db(db)
+        self.assertIn("CREATE TABLE", latest_text)
+
+    def test_preexisting(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "preexisting.db")
+        with open(fn, "w"):
+            pass
+        with self.assertRaises(database.DBAlreadyExists):
+            database.create_channel_db(fn)
+
+    def test_create(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "created.db")
+        db = database.create_channel_db(fn)
+        latest_text = dump_db(db)
+        self.assertIn("CREATE TABLE", latest_text)
+
+    def test_create_or_upgrade(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "created.db")
+        db = database.create_or_upgrade_channel_db(fn)
+        latest_text = dump_db(db)
+        self.assertIn("CREATE TABLE", latest_text)
+
+class CreateUsage(unittest.TestCase):
+    def test_memory(self):
+        db = database.create_usage_db(":memory:")
+        latest_text = dump_db(db)
+        self.assertIn("CREATE TABLE", latest_text)
+
+    def test_preexisting(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "preexisting.db")
+        with open(fn, "w"):
+            pass
+        with self.assertRaises(database.DBAlreadyExists):
+            database.create_usage_db(fn)
+
+    def test_create(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "created.db")
+        db = database.create_usage_db(fn)
+        latest_text = dump_db(db)
+        self.assertIn("CREATE TABLE", latest_text)
+
+    def test_create_or_upgrade(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "created.db")
+        db = database.create_or_upgrade_usage_db(fn)
+        latest_text = dump_db(db)
+        self.assertIn("CREATE TABLE", latest_text)
+
+    def test_create_or_upgrade_disabled(self):
+        db = database.create_or_upgrade_usage_db(None)
+        self.assertIs(db, None)
+
+class OpenChannel(unittest.TestCase):
+    def test_open(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "created.db")
+        db1 = database.create_channel_db(fn)
+        latest_text = dump_db(db1)
+        self.assertIn("CREATE TABLE", latest_text)
+        db2 = database.open_existing_db(fn)
+        self.assertIn("CREATE TABLE", dump_db(db2))
+
+    def test_doesnt_exist(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "created.db")
+        with self.assertRaises(database.DBDoesntExist):
+            database.open_existing_db(fn)
+
+class OpenUsage(unittest.TestCase):
+    def test_open(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "created.db")
+        db1 = database.create_usage_db(fn)
+        latest_text = dump_db(db1)
+        self.assertIn("CREATE TABLE", latest_text)
+        db2 = database.open_existing_db(fn)
+        self.assertIn("CREATE TABLE", dump_db(db2))
+
+    def test_doesnt_exist(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "created.db")
+        with self.assertRaises(database.DBDoesntExist):
+            database.open_existing_db(fn)
+

+ 92 - 0
Directory/conftest.py

@@ -0,0 +1,92 @@
+import os
+import shutil
+
+import pytest
+
+
+def create_file(path: str, content: str):
+    """Create txt file with specific content"""
+    with open(f"{path}", "w") as file:
+        file.write(content)
+
+
+@pytest.fixture
+def create_files():
+    """Create files with equal or non-equal content"""
+    create_file("tests/file1.txt", "hello, world")
+    create_file("tests/file2.txt", "hello, world!")
+    create_file("tests/file3.txt", "hello, world")
+    yield
+    os.remove("tests/file1.txt")
+    os.remove("tests/file2.txt")
+    os.remove("tests/file3.txt")
+
+
+@pytest.fixture
+def create_dirs_and_files():
+    os.makedirs("tests/dir1/dir2")
+    os.makedirs("tests/dir3/dir4")
+    create_file("tests/dir1/file1.txt", "aaa")
+    create_file("tests/dir3/file2.txt", "bbb")
+    yield
+    shutil.rmtree("tests/dir1")
+    shutil.rmtree("tests/dir3")
+
+
+@pytest.fixture
+def create_nested_dirs_and_files_first_case():
+    """Create common case for synch function"""
+    os.makedirs("tests/source/dir1")
+    os.mkdir("tests/source/dir2")
+    os.mkdir("tests/source/dir3")
+    create_file("tests/source/dir1/file1.txt", "abacaba")
+    os.makedirs("tests/replica/dir1")
+    os.mkdir("tests/replica/dir4")
+    yield
+    shutil.rmtree("tests/source")
+    shutil.rmtree("tests/replica")
+
+
+@pytest.fixture
+def create_nested_dirs_and_files_second_case():
+    """Create common case for synch function"""
+    os.makedirs("tests/source/dir1/dir2")
+    create_file("tests/source/dir1/dir2/file1.txt", "hello")
+    os.makedirs("tests/replica/dir1")
+    os.mkdir("tests/replica/dir4")
+    create_file("tests/replica/dir4/file2.txt", "hello")
+    yield
+    shutil.rmtree("tests/source")
+    shutil.rmtree("tests/replica")
+
+
+@pytest.fixture
+def create_two_different_files():
+    """Create two different files"""
+    os.mkdir("tests/source")
+    os.mkdir("tests/replica")
+    create_file("tests/source/file1.txt", "aaa")
+    create_file("tests/replica/file1.txt", "bbb")
+    yield
+    shutil.rmtree("tests/source")
+    shutil.rmtree("tests/replica")
+
+
+@pytest.fixture
+def create_empty_source_dir():
+    """Create empty source dir and non-empty replic's one"""
+    os.mkdir("tests/source")
+    os.makedirs("tests/replica/dir1")
+    yield
+    shutil.rmtree("tests/source")
+    shutil.rmtree("tests/replica")
+
+
+@pytest.fixture
+def create_empty_replica_dir():
+    """Create empty replica dir and non-empty source one"""
+    os.makedirs("tests/source/dir1/dir2")
+    os.mkdir("tests/replica")
+    yield
+    shutil.rmtree("tests/source")
+    shutil.rmtree("tests/replica")

+ 394 - 0
Directory/data_preprocessing_utils.py

@@ -0,0 +1,394 @@
+# Customary Imports:
+import tensorflow as tf
+assert '2.' in tf.__version__  # make sure you're using tf 2.0
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import sklearn
+import skimage
+import cv2 as cv
+import os
+import datetime
+import scipy
+from skimage.morphology import reconstruction
+from skimage import exposure
+import scipy.io as sio
+import h5py
+import random
+import shutil
+import PIL
+import imageio
+import pydot 
+import graphviz
+import plotly.graph_objects as go
+import preprocess_crop
+from pathlib import Path
+from tensorflow.keras import backend as K
+from PIL import Image
+from keras.preprocessing.image import ImageDataGenerator
+from tensorflow.keras.layers import Dense, Flatten, Conv2D
+from tensorflow.keras import Model
+#from keras.utils import CustomObjectScope
+from mpl_toolkits.mplot3d import Axes3D
+import data_preprocessing_utils
+##################################################################################################################################
+'''
+DATA PREPROCESSING UTILS:
+'''
+##################################################################################################################################
+# Converting MAP Files:
+def convert_MAP(directory, output_directory, min_shape, file_format = '.npy', search_keys = None, dtype = np.float32):
+    '''
+    Loops through the given raw-data directory and converts each .mat file into a
+    .npy file (or another image format), keeping only the arrays whose keys match
+    search_keys and whose shape is at least min_shape.
+    '''
+    new_dir = os.path.join(os.getcwd(), output_directory)
+    if not os.path.exists(new_dir):
+        os.mkdir(new_dir)
+    else:
+        shutil.rmtree(new_dir)
+        os.mkdir(new_dir)
+    for file in os.listdir(directory):
+        filename = os.fsdecode(file)
+        if filename.endswith(".mat"): 
+            #print(os.path.join(directory, filename))
+            filepath = os.path.join(directory, filename)
+            array_dict = {}
+            try:
+                # MATLAB v7.3 files are HDF5 containers
+                f = h5py.File(filepath, 'r')
+            except OSError:
+                # older .mat versions are loaded with scipy.io instead
+                f = sio.loadmat(filepath)
+            for k, v in f.items():
+                array_dict[k] = np.array(v, dtype = np.float32)
+            # Only the image data is needed, so keep just the entries whose keys match
+            if search_keys is None:
+                search_keys = 'map'  # by default, keep the "map" entry of the .mat struct
+                filtered_dict = dict(filter(lambda item: search_keys in item[0], array_dict.items()))
+            else:
+                filtered_dict = {}
+                for i in range(len(search_keys)):
+                    search_key = search_keys[i]
+                    if search_key in array_dict:
+                        filtered_dict[search_key] = array_dict[search_key]
+            if len(filtered_dict) == 0:
+                print('No Data to Meet Search Key Requirements: Datapoint Rejected -> ' + filepath)
+            else:
+                #print(list(array_dict.keys()))
+                #print(filtered_dict)
+                arrays = []
+                for k, v in filtered_dict.items():
+                    temp = np.transpose(v.astype(np.float32))
+                    # To normalize data between [-1,1], use -> arrays = arrays/(np.max(arrays)/2) - 1
+                    # To normalize data between [0,1], use -> arrays = arrays/(np.max(arrays))
+                    # To normalize data between [0,255], 
+                    #     use -> arrays = (arrays/(np.max(arrays))*255).astype(np.uint8)
+                    temp = temp/(np.max(temp))
+                    arrays.append(temp)
+                for i in range(len(arrays)):
+                    if len(arrays[i].shape) > 2:
+                        #print(arrays[i].shape)
+                        arrays[i] = np.mean(arrays[i], axis = 2)
+
+                for i in range(len(arrays)):
+                    new_dir_filepath = os.path.join(new_dir, os.path.splitext(filename)[0]
+                                                    + '_index' + str(i) + file_format)
+                    array = arrays[i]
+                    if array.shape[0] >= min_shape[0] and array.shape[1] >= min_shape[1]:
+                        if file_format == '.npy':
+                            np.save(new_dir_filepath, array, allow_pickle=True, fix_imports=True)
+                        else:
+                            imageio.imwrite(new_dir_filepath, array)
+                    elif i == 0:
+                        print('Min Size Not Met: Datapoint Rejected -> ' + filepath)
+    return os.path.join(os.getcwd(), output_directory)
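+
+# Illustrative call (directory names and search keys are placeholders):
+#   convert_MAP('raw_data', 'converted_data', min_shape=(128, 128), search_keys=['map'])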
+
+##################################################################################################################################
+# Data Cleaning Procedures:
+def data_clean_func(image = None):
+    if image is not None:
+        #print(len(np.unique(image)))
+        #clean_image = image
+        '''
+        plt.hist(image)
+        plt.show()
+        '''
+        '''
+        plt.imshow(image, cmap='gray')
+        plt.title('Original Image')
+        plt.show()
+        '''
+        threshold = 0.85
+        default_fill = 0.0
+        frac_of_high_clip = 1/9
+        image[image > threshold] = default_fill
+        image[image < frac_of_high_clip*(1.0-threshold)] = default_fill
+        '''
+        plt.imshow(image, cmap='gray')
+        plt.title('After Clipping')
+        plt.show()
+        '''
+        image = scipy.ndimage.median_filter(image, size=(4, 4))
+        '''
+        plt.imshow(image, cmap='gray')
+        plt.title('After Median Filter')
+        plt.show()
+        '''
+        image = skimage.filters.gaussian(image, sigma=0.01, output=None, mode='reflect', preserve_range=True)
+        ####################################################################
+        # Added to ensure negligible loss when converting to int16 
+        # within exposure.equalize_adapthist
+        image = (image/np.max(image)*(2**16)).astype(np.uint16)
+        # A "Monkey Patch" could possibly be used as a cleaner solution, 
+        # but would be more involved than is necessary for my application
+        ####################################################################
+        image = exposure.equalize_adapthist(image,kernel_size=image.shape[0]//8, clip_limit=0.005, nbins=2**13)
+        image = image.astype(np.float64)
+        '''
+        plt.imshow(image, cmap='gray')
+        plt.title('After Local Adapt Hist')
+        plt.show()
+        '''
+        image = scipy.ndimage.median_filter(image, size=(3, 1))
+        image = scipy.ndimage.median_filter(image, size=(1, 3))
+        image = skimage.filters.gaussian(image, sigma=0.1, output=None, mode='reflect', preserve_range=True)
+        image = exposure.rescale_intensity(image, in_range='image', out_range=(0.0,1.0))
+        '''
+        plt.imshow(image, cmap='gray')
+        plt.title('Final Image')
+        plt.show()
+        '''
+        '''
+        plt.hist(image)
+        plt.show()
+        '''
+        clean_image = image.astype(np.float32)
+    else:
+        clean_image = image
+    return clean_image
+
+def data_cleaning(input_dir = 'converted_data', output_dir_name = 'cleaned_data',
+                  output_file_format ='.npy', delete_previous = True):
+    '''
+     Removes some noise from the data and makes the underlying
+     vessel structure more prominent.
+     Input: input_dir -> directory that holds the data to be cleaned
+            output_dir_name -> directory that will hold the cleaned data
+     Output: None
+    '''
+    file_list = os.listdir(input_dir)
+    clean_dir = os.path.join(os.getcwd(), output_dir_name)
+    if not os.path.exists(clean_dir):
+        os.mkdir(clean_dir)
+    elif delete_previous == True:
+        shutil.rmtree(clean_dir)
+        os.mkdir(clean_dir)
+    for file in file_list:
+        filename = os.fsdecode(file)
+        filepath = os.path.join(input_dir, filename)
+        if filepath.endswith('.npy'):
+            array = np.load(filepath)
+        else:
+            array = imageio.imread(filepath)
+            
+        # Defined data clean function above:
+        array = data_preprocessing_utils.data_clean_func(array)
+    
+        new_filepath = os.path.join(clean_dir, filename)
+        if output_file_format == '.npy':
+            new_filepath = Path(new_filepath)
+            new_filepath = new_filepath.with_suffix('')
+            new_filepath = new_filepath.with_suffix(output_file_format)
+            np.save(new_filepath, array, allow_pickle=True, fix_imports=True)
+        else:
+            new_filepath = Path(new_filepath)
+            new_filepath = new_filepath.with_suffix('')
+            new_filepath = new_filepath.with_suffix(output_file_format)
+            imageio.imwrite(new_filepath, array)
+    return  
+
+    
+##################################################################################################################################
+# Data Separation / Validation Split Procedures:
+def data_seperation(input_dir, dataset_percentages, 
+                    delete_previous = False, file_format = '.npy', 
+                    scale = 1):
+    '''
+    Splits the files in input_dir into a 'data' folder with separate sections
+    for training, validation, and testing according to the given percentages.
+    Input: input_dir -> path to the folder of numpy/image files
+           dataset_percentages -> (% train, % test) such that % train + % test = 100
+           OR
+           dataset_percentages -> (% train, % val, % test) such that % train + % val + % test = 100
+    Output: new folders for training and testing or training/validation/testing
+    '''
+    
+    # If just train and test
+    if len(dataset_percentages) == 2:
+        # Making Main data folder
+        new_dir = os.path.join(os.getcwd(), 'data')
+        if not os.path.exists(new_dir):
+            os.mkdir(new_dir)
+        
+        # Making train subfolder
+        train_dir = os.path.join(new_dir, 'train')
+        if not os.path.exists(train_dir):
+            os.mkdir(train_dir)
+            train_dir = os.path.join(train_dir, 'input')
+            os.mkdir(train_dir)
+        elif delete_previous == True:
+            shutil.rmtree(train_dir)
+            os.mkdir(train_dir)
+            train_dir = os.path.join(train_dir, 'input')
+            os.mkdir(train_dir)
+        
+        # Making test subfolder
+        test_dir = os.path.join(new_dir, 'test')
+        if not os.path.exists(test_dir):
+            os.mkdir(test_dir)
+            test_dir = os.path.join(test_dir, 'input')
+            os.mkdir(test_dir)
+        elif delete_previous == True:
+            shutil.rmtree(test_dir)
+            os.mkdir(test_dir)
+            test_dir = os.path.join(test_dir, 'input')
+            os.mkdir(test_dir)
+
+
+        file_list = os.listdir(input_dir)
+        total_num_imgs = len(file_list)
+        train_percent = dataset_percentages[0]
+        test_percent = dataset_percentages[1]
+        valid_inputs = (train_percent >= test_percent and train_percent <= 100 and
+                        test_percent <= 100 and train_percent > 0 and test_percent > 0 and
+                        train_percent + test_percent == 100)
+        if valid_inputs:
+            num_train = int(round(total_num_imgs * train_percent//100))
+        else:
+            num_train = int(round(total_num_imgs * 0.9))
+            print('ERROR: Please input valid percentages for dataset division')
+            print('In place of a valid input, the ratio 90% train, 10% test was used')
+        
+        index = 0
+        random.shuffle(file_list)
+        for file in file_list:
+            filename = os.fsdecode(file)
+            filepath = os.path.join(input_dir, filename)
+            # Loads File
+            if filepath.endswith('.npy'):
+                array = np.load(filepath)
+                array = array/np.max(array)*scale
+            else:
+                array = imageio.imread(filepath)
+                array = array/np.max(array)*scale
+            if index < num_train:
+                new_filepath = os.path.join(train_dir, filename)
+            else:
+                new_filepath = os.path.join(test_dir, filename)
+            # Saves File
+            if file_format == '.npy':
+                new_filepath = Path(new_filepath)
+                new_filepath = new_filepath.with_suffix('')
+                new_filepath = new_filepath.with_suffix(file_format)
+                np.save(new_filepath, array, allow_pickle=True, fix_imports=True)
+            else:
+                new_filepath = Path(new_filepath)
+                new_filepath = new_filepath.with_suffix('')
+                new_filepath = new_filepath.with_suffix(file_format)
+                imageio.imwrite(new_filepath, array)
+            index += 1
+        return train_dir, test_dir
+    # If train, val, and test
+    elif len(dataset_percentages) == 3:
+        # Making Main data folder
+        new_dir = os.path.join(os.getcwd(), 'data')
+        if not os.path.exists(new_dir):
+            os.mkdir(new_dir)
+            
+        # Making train subfolder
+        train_dir = os.path.join(new_dir, 'train')
+        if not os.path.exists(train_dir):
+            os.mkdir(train_dir)
+            train_dir = os.path.join(train_dir, 'input')
+            os.mkdir(train_dir)
+        elif delete_previous == True:
+            shutil.rmtree(train_dir)
+            os.mkdir(train_dir)
+            train_dir = os.path.join(train_dir, 'input')
+            os.mkdir(train_dir)
+        
+        # Making val subfolder
+        val_dir = os.path.join(new_dir, 'val')
+        if not os.path.exists(val_dir):
+            os.mkdir(val_dir)
+            val_dir = os.path.join(val_dir, 'input')
+            os.mkdir(val_dir)
+        elif delete_previous == True:
+            shutil.rmtree(val_dir)
+            os.mkdir(val_dir)
+            val_dir = os.path.join(val_dir, 'input')
+            os.mkdir(val_dir)
+        
+        # Making test subfolder
+        test_dir = os.path.join(new_dir, 'test')
+        if not os.path.exists(test_dir):
+            os.mkdir(test_dir)
+            test_dir = os.path.join(test_dir, 'input')
+            os.mkdir(test_dir)
+        elif delete_previous == True:
+            shutil.rmtree(test_dir)
+            os.mkdir(test_dir)
+            test_dir = os.path.join(test_dir, 'input')
+            os.mkdir(test_dir)
+            
+        file_list = os.listdir(input_dir)
+        total_num_imgs = len(file_list)
+        train_percent = dataset_percentages[0]
+        val_percent = dataset_percentages[1]
+        test_percent = dataset_percentages[2]
+        valid_inputs = (train_percent >= test_percent and train_percent >= val_percent 
+                        and train_percent <= 100 and val_percent <= 100 and test_percent <= 100
+                        and train_percent > 0 and val_percent > 0 and test_percent > 0 and
+                        train_percent + val_percent + test_percent == 100)
+        if valid_inputs:
+            num_train = int(round(total_num_imgs * train_percent//100))
+            num_val = int(round(total_num_imgs * val_percent//100))
+        else:
+            num_train = int(round(total_num_imgs * 0.9))
+            num_val = int(round((total_num_imgs - num_train)/2))
+            print('ERROR: Please input valid percentages for dataset division')
+            print('In place of a valid input the ratio 90% train, 5% val, 5% test was used')
+        
+        index = 0
+        random.shuffle(file_list)
+        for file in file_list:
+            filename = os.fsdecode(file)
+            filepath = os.path.join(input_dir, filename)
+            # Loads File
+            if filepath.endswith('.npy'):
+                array = np.load(filepath)
+                array = array/np.max(array)*scale
+            else:
+                array = imageio.imread(filepath)
+                array = array/np.max(array)*scale
+            if index < num_train:
+                new_filepath = os.path.join(train_dir, filename)
+            elif index < num_train + num_val:
+                new_filepath = os.path.join(val_dir, filename)
+            else:
+                new_filepath = os.path.join(test_dir, filename)
+            # Saves File
+            if file_format == '.npy':
+                new_filepath = Path(new_filepath)
+                new_filepath = new_filepath.with_suffix('')
+                new_filepath = new_filepath.with_suffix(file_format)
+                np.save(new_filepath, array, allow_pickle=True, fix_imports=True)
+            else:
+                new_filepath = Path(new_filepath)
+                new_filepath = new_filepath.with_suffix('')
+                new_filepath = new_filepath.with_suffix(file_format)
+                imageio.imwrite(new_filepath, array)
+            index += 1
+        return train_dir, val_dir, test_dir
+    else:
+        print('ERROR: Please divide into train/test or train/val/test')
+        return None
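+
+# Illustrative split (directory name is a placeholder):
+#   train_dir, val_dir, test_dir = data_seperation('cleaned_data', (80, 10, 10))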

+ 122 - 0
Directory/diml_to_interiornet.py

@@ -0,0 +1,122 @@
+import cv2
+import os
+import shutil
+import numpy as np
+
+
+def sample_to_interiornet():
+    diml_path = "/nvme/datasets/diml_depth/scenes"
+    hd7_path = "/nvme/datasets/diml_depth/HD7"
+
+    depth_paths = [
+        "/nvme/datasets/diml_depth/train/HR/11. Bedroom/depth_filled",
+        "/nvme/datasets/diml_depth/train/HR/12. Livingroom/depth_filled"]
+
+    depth_images = []
+    for path in depth_paths:
+        depth_images += [os.path.join(path, name) for name in os.listdir(path)
+                         if os.path.isfile(os.path.join(path, name))]
+
+    scene_paths = [os.path.join(diml_path, name) for name in os.listdir(diml_path)
+                   if os.path.isdir(os.path.join(diml_path, name))]
+
+    for scene_path in scene_paths:
+        frame_paths = [os.path.join(scene_path, name) for name in os.listdir(scene_path)
+                       if os.path.isfile(os.path.join(scene_path, name))]
+
+        new_frame_path = os.path.join(hd7_path, scene_path.split('/')[-1])
+        os.mkdir(new_frame_path)
+        os.mkdir(os.path.join(new_frame_path, "cam0"))
+        os.mkdir(os.path.join(new_frame_path, "depth0"))
+        os.mkdir(os.path.join(new_frame_path, "label0"))
+        os.mkdir(os.path.join(new_frame_path, "cam0", "data"))
+        os.mkdir(os.path.join(new_frame_path, "depth0", "data"))
+        os.mkdir(os.path.join(new_frame_path, "label0", "data"))
+        print(new_frame_path)
+        for i, frame_path in enumerate(frame_paths):
+            file_name = frame_path.split('/')[-1][:-6]
+            img = cv2.imread(frame_path, cv2.IMREAD_UNCHANGED)
+            print(file_name)
+            depth_path = [path for path in depth_images if file_name in path][0]
+            depth_img = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)
+
+            img = cv2.resize(img, dsize=(img.shape[1] // 2, img.shape[0] // 2), interpolation=cv2.INTER_LINEAR)
+            depth_img = cv2.resize(depth_img, dsize=(depth_img.shape[1] // 2, depth_img.shape[0] // 2),
+                                   interpolation=cv2.INTER_LINEAR)
+            label_img = depth_img.copy()
+            label_img[:, :] = 3
+
+            cv2.imwrite(os.path.join(new_frame_path, "cam0", "data", "{}.png".format(i)), img)
+            cv2.imwrite(os.path.join(new_frame_path, "depth0", "data", "{}.png".format(i)), depth_img)
+            cv2.imwrite(os.path.join(new_frame_path, "label0", "data", "{}_instance.png".format(i)), label_img)
+            cv2.imwrite(os.path.join(new_frame_path, "label0", "data", "{}_nyu.png".format(i)), label_img)
+
+
+def full_to_interiornet():
+    scene_file_path = "/nvme/datasets/diml_depth/scenes.txt"
+    base_path = "/nvme/datasets/diml_depth/"
+    out_path = "/nvme/datasets/diml_depth/HD7/"
+    cam0_render = "/nvme/datasets/interiornet/3FO4IDEI1LAV_Bedroom/cam0.render"
+    num_frames = 20
+    shape = (672, 378)
+    np.random.seed(123)
+
+    with open(scene_file_path, 'r') as f:
+        scene_lines = f.readlines()
+
+    scene_lines = [sn.split('\n')[0] for sn in scene_lines]
+    scene_paths = [os.path.join(base_path, sn.split('-')[0]) for sn in scene_lines]
+    scene_ranges = [sn.split('-')[1] for sn in scene_lines]
+    scene_ranges = [(int(rn[1:-1].split(':')[0]), int(rn[1:-1].split(':')[1])) for rn in scene_ranges]
+
+    for i, scene_path in enumerate(scene_paths):
+        file_list = []
+        for j in range(scene_ranges[i][0], scene_ranges[i][1]+1):
+            scene_path_col = os.path.join(scene_path, "{}/col".format(j))
+            if os.path.exists(scene_path_col):
+                file_list += [os.path.join(scene_path_col, dn) for dn in os.listdir(scene_path_col)]
+
+        scene_count = len(os.listdir(out_path))
+        scene_out_path = "{:02d}DIML_{}".format(scene_count + 1, scene_path.split('/')[-2].split(' ')[1])
+        scene_out_path = os.path.join(out_path, scene_out_path)
+
+        if os.path.exists(scene_out_path):
+            shutil.rmtree(scene_out_path)
+        os.mkdir(scene_out_path)
+        os.mkdir(os.path.join(scene_out_path, "cam0"))
+        os.mkdir(os.path.join(scene_out_path, "depth0"))
+        os.mkdir(os.path.join(scene_out_path, "label0"))
+        os.mkdir(os.path.join(scene_out_path, "cam0", "data"))
+        os.mkdir(os.path.join(scene_out_path, "depth0", "data"))
+        os.mkdir(os.path.join(scene_out_path, "label0", "data"))
+        shutil.copyfile(cam0_render, os.path.join(scene_out_path, "cam0.render"))
+        print(scene_out_path)
+
+        frame_paths = np.random.choice(file_list, num_frames, False)
+        for j, frame_path in enumerate(frame_paths):
+            img = cv2.imread(frame_path, cv2.IMREAD_UNCHANGED)
+            depth_path = frame_path.replace('/col/', '/up_png/')
+            depth_path = depth_path.replace('_c.png', '_ud.png')
+            depth_img = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)
+
+            if depth_img is None:
+                print(depth_path)
+                exit()
+            if img is None:
+                print(frame_path)
+                exit()
+
+            img = cv2.resize(img, dsize=shape, interpolation=cv2.INTER_LINEAR)
+            depth_img = cv2.resize(depth_img, dsize=shape,
+                                   interpolation=cv2.INTER_LINEAR)
+            label_img = depth_img.copy()
+            label_img[:, :] = 3
+
+            cv2.imwrite(os.path.join(scene_out_path, "cam0", "data", "{}.png".format(j)), img)
+            cv2.imwrite(os.path.join(scene_out_path, "depth0", "data", "{}.png".format(j)), depth_img)
+            cv2.imwrite(os.path.join(scene_out_path, "label0", "data", "{}_instance.png".format(j)), label_img)
+            cv2.imwrite(os.path.join(scene_out_path, "label0", "data", "{}_nyu.png".format(j)), label_img)
+
+
+if __name__ == '__main__':
+    full_to_interiornet()
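+
+# Note added for clarity (an assumption based on the parsing in full_to_interiornet, not on any
+# dataset documentation): scenes.txt is expected to hold one scene per line in the form
+#   <scene path relative to base_path>-(<first subfolder>:<last subfolder>)
+# e.g. a hypothetical line "train/HR/11. Bedroom/scene_a-(1:5)" would make the loop look for
+# colour frames under <base_path>/train/HR/11. Bedroom/scene_a/1/col ... /5/col.
+# sample_to_interiornet() is the alternative entry point for the pre-sampled scenes folder.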

+ 177 - 0
Directory/ego_to_json.py

@@ -0,0 +1,177 @@
+import os 
+import shutil 
+import json
+import scipy.io
+import random
+
+# ego_to_json.py converts egohands_data into a form that can then be transformed into the
+# different formats required by the networks.
+# Three folders: train, test, val
+
+# |-- train
+# |  | -- images
+# |  | -- annotations.json
+# |-- val
+# |  | -- images
+# |  | -- annotations.json
+# |-- test
+# |  | -- images
+# |  | -- annotations.json
+
+
+# annotations.json:
+# { 
+#     "CARDS_OFFICE_H_T_frame_0001.jpg": 
+#     {
+#         "name": "CARDS_OFFICE_H_T_frame_0001.jpg",
+#         "objects": [[]]
+#     },
+#     "CARDS_OFFICE_H_T_frame_0002.jpg":
+#     {
+#         "name": "CARDS_OFFICE_H_T_frame_0002.jpg",
+#         "objects": [[]]
+#     }
+# }
+
+ROOT_DIR = "../egohands_data"
+ANNOTATION_FILE = "polygons.mat"
+SAVE_FILE = "annotations.json"
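+
+# Illustrative sketch (not part of the original pipeline): after create_annotations() has run for a
+# split, the resulting annotations.json can be inspected roughly like this:
+#
+#   with open(os.path.join(ROOT_DIR, "train", SAVE_FILE)) as f:
+#       annotations = json.load(f)
+#   for name, entry in annotations.items():
+#       print(name, "polygons:", len(entry["objects"]))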
+
+
+def split_test():
+    os.makedirs(os.path.join(ROOT_DIR, "test"))
+    os.makedirs(os.path.join(ROOT_DIR, "val"))
+    os.makedirs(os.path.join(ROOT_DIR, "train"))
+    
+    shutil.move(os.path.join(ROOT_DIR, "CARDS_COURTYARD_B_T"), os.path.join(ROOT_DIR, "test", "CARDS_COURTYARD_B_T"))
+    shutil.move(os.path.join(ROOT_DIR, "CARDS_OFFICE_S_B"), os.path.join(ROOT_DIR, "test", "CARDS_OFFICE_S_B"))
+    shutil.move(os.path.join(ROOT_DIR, "CHESS_COURTYARD_B_T"), os.path.join(ROOT_DIR, "test", "CHESS_COURTYARD_B_T"))
+    shutil.move(os.path.join(ROOT_DIR, "CHESS_LIVINGROOM_T_H"), os.path.join(ROOT_DIR, "test", "CHESS_LIVINGROOM_T_H"))   
+    shutil.move(os.path.join(ROOT_DIR, "JENGA_LIVINGROOM_S_T"), os.path.join(ROOT_DIR, "test", "JENGA_LIVINGROOM_S_T"))
+    shutil.move(os.path.join(ROOT_DIR, "JENGA_OFFICE_H_T"), os.path.join(ROOT_DIR, "test", "JENGA_OFFICE_H_T"))
+    shutil.move(os.path.join(ROOT_DIR, "PUZZLE_COURTYARD_H_T"), os.path.join(ROOT_DIR, "test", "PUZZLE_COURTYARD_H_T"))
+    shutil.move(os.path.join(ROOT_DIR, "PUZZLE_LIVINGROOM_T_B"), os.path.join(ROOT_DIR, "test", "PUZZLE_LIVINGROOM_T_B"))
+
+    shutil.move(os.path.join(ROOT_DIR, "CARDS_LIVINGROOM_S_H"), os.path.join(ROOT_DIR, "val", "CARDS_LIVINGROOM_S_H"))
+    shutil.move(os.path.join(ROOT_DIR, "CHESS_COURTYARD_H_S"), os.path.join(ROOT_DIR, "val", "CHESS_COURTYARD_H_S"))
+    shutil.move(os.path.join(ROOT_DIR, "JENGA_COURTYARD_T_S"), os.path.join(ROOT_DIR, "val", "JENGA_COURTYARD_T_S"))
+    shutil.move(os.path.join(ROOT_DIR, "PUZZLE_OFFICE_S_T"), os.path.join(ROOT_DIR, "val", "PUZZLE_OFFICE_S_T"))
+
+    train = ['CARDS_COURTYARD_H_S','CARDS_COURTYARD_S_H','CARDS_COURTYARD_T_B','CARDS_LIVINGROOM_B_T','CARDS_LIVINGROOM_H_S','CARDS_LIVINGROOM_T_B','CARDS_OFFICE_B_S','CARDS_OFFICE_H_T','CARDS_OFFICE_T_H','CHESS_COURTYARD_S_H','CHESS_COURTYARD_T_B','CHESS_LIVINGROOM_B_S','CHESS_LIVINGROOM_H_T','CHESS_LIVINGROOM_S_B','CHESS_OFFICE_B_S','CHESS_OFFICE_H_T','CHESS_OFFICE_S_B','CHESS_OFFICE_T_H','JENGA_COURTYARD_B_H','JENGA_COURTYARD_H_B','JENGA_COURTYARD_S_T','JENGA_LIVINGROOM_B_H','JENGA_LIVINGROOM_H_B','JENGA_LIVINGROOM_T_S','JENGA_OFFICE_B_S','JENGA_OFFICE_S_B','JENGA_OFFICE_T_H','PUZZLE_COURTYARD_B_S','PUZZLE_COURTYARD_S_B','PUZZLE_COURTYARD_T_H','PUZZLE_LIVINGROOM_B_T','PUZZLE_LIVINGROOM_H_S','PUZZLE_LIVINGROOM_S_H','PUZZLE_OFFICE_B_H','PUZZLE_OFFICE_H_B','PUZZLE_OFFICE_T_S']
+
+    for folder in train:
+        shutil.move(os.path.join(ROOT_DIR, folder), os.path.join(ROOT_DIR, "train", folder))
+
+def json_test():
+    # test_dir = os.path.join(ROOT_DIR, "test")
+    # os.makedirs(os.path.join(test_dir, "images"))
+    # img_dir = os.path.join(test_dir, "images")
+
+    # create_annotations(test_dir,img_dir)
+
+    # val_dir = os.path.join(ROOT_DIR, "val")
+    # os.makedirs(os.path.join(val_dir, "images"))
+    # img_dir = os.path.join(val_dir, "images")
+
+    # create_annotations(val_dir,img_dir)
+
+    train_dir = os.path.join(ROOT_DIR, "train")
+    # os.makedirs(os.path.join(train_dir, "images"))
+    img_dir = os.path.join(train_dir, "images")
+
+    create_annotations(train_dir,img_dir)
+
+   
+def json_train_val():
+    os.makedirs(os.path.join(ROOT_DIR, "tmp"))
+    tmp_dir = os.path.join(ROOT_DIR, "tmp")
+    os.makedirs(os.path.join(tmp_dir, "images"))
+    img_dir = os.path.join(tmp_dir, "images")
+
+    for dir_name in os.listdir(ROOT_DIR):
+        if not (dir_name == "tmp" or dir_name == "test"):
+            shutil.move(os.path.join(ROOT_DIR, dir_name), os.path.join(tmp_dir, dir_name))
+
+    create_annotations(tmp_dir, img_dir)
+
+def create_annotations(directory, img_dir):
+    annotations = {}
+    for dir_name in os.listdir(directory):
+        if not (dir_name == "images"):
+            for _, _, files in os.walk(os.path.join(directory, dir_name)):
+                mat = scipy.io.loadmat(os.path.join(directory, dir_name, ANNOTATION_FILE))
+
+                for i, img_file in enumerate(sorted(files)):
+                    if not (img_file.endswith(".mat")):
+                        new_img_file = dir_name + "_" + img_file
+
+                        image = {
+                            "name":     new_img_file,
+                            "objects":  []
+                        }
+
+                        for segmentation in mat["polygons"][0][i]:
+                            if segmentation.any():
+                                image["objects"].append(segmentation.tolist())
+                        
+                        annotations[new_img_file] = image
+
+                        shutil.move(os.path.join(directory, dir_name, img_file), os.path.join(img_dir, new_img_file))
+
+    with open(os.path.join(directory, SAVE_FILE), 'w') as output_json_file:
+        json.dump(annotations, output_json_file)
+
+    for dir_name in os.listdir(directory):
+        if not (dir_name == "images" or dir_name == "annotations.json"):
+            shutil.rmtree(os.path.join(directory, dir_name))
+
+def split_train_val():    
+    tmp_dir = os.path.join(ROOT_DIR, "tmp")
+    
+    os.makedirs(os.path.join(ROOT_DIR, "train"))
+    train_dir = os.path.join(ROOT_DIR, "train")
+    os.makedirs(os.path.join(train_dir, "images"))
+    
+    os.makedirs(os.path.join(ROOT_DIR, "val"))
+    val_dir = os.path.join(ROOT_DIR, "val")
+    os.makedirs(os.path.join(val_dir, "images"))
+
+    # Opening JSON file
+    with open(os.path.join(tmp_dir, 'annotations.json')) as json_file:
+        data = json.load(json_file)
+
+        # 0.1765 of the remaining data is roughly 15% of the full dataset, because the test split already takes about 20% (not exactly)
+        val_keys = random.sample(list(data), round(len(data) * 0.1765))
+
+        validation = {k: v for k, v in data.items() if k in val_keys}
+        train = {k: v for k, v in data.items() if k not in val_keys}
+
+    with open(os.path.join(val_dir, SAVE_FILE), 'w') as output_json_file:
+        json.dump(validation, output_json_file)
+
+    with open(os.path.join(train_dir, SAVE_FILE), 'w') as output_json_file:
+        json.dump(train, output_json_file)
+        
+    for key, _ in validation.items():
+        shutil.move(os.path.join(tmp_dir, "images", key), os.path.join(val_dir, "images", key))
+
+    for key, _ in train.items():
+        shutil.move(os.path.join(tmp_dir, "images", key), os.path.join(train_dir, "images"))
+
+    shutil.rmtree(tmp_dir)
+
+def move_to_folder():
+    os.makedirs(os.path.join(ROOT_DIR, "json"))
+    json_dir = os.path.join(ROOT_DIR, "json")
+    shutil.move(os.path.join(ROOT_DIR, "test"), json_dir)
+    shutil.move(os.path.join(ROOT_DIR, "val"), json_dir)
+    shutil.move(os.path.join(ROOT_DIR, "train"), json_dir)
+
+    shutil.move(ROOT_DIR, "../data")
+
+
+# split_test()
+json_test()
+# json_train_val()
+# split_train_val()
+move_to_folder()

+ 107 - 0
Directory/esquema.py

@@ -0,0 +1,107 @@
+import errno
+import os
+from flask import jsonify
+
+def crearFacultad(request):
+    
+    try:
+        
+        json_req = request.json
+        fac_nombre = json_req['fac_nombre']
+        
+        os.mkdir('resources/'+fac_nombre)
+         
+    except OSError:
+        
+        return jsonify({"message":"error al crear facultad"}),500
+    
+    else:
+        
+        return jsonify({"message":"facultad creada"}),200
+    
+
+def crearCarrera(request):
+    
+    try:
+        
+        json_req = request.json
+        fac_nombre = json_req['fac_nombre']
+        car_nombre= json_req['car_nombre']
+        
+        os.mkdir('resources/'+fac_nombre+'/'+car_nombre)
+        
+    except OSError:
+        
+        return jsonify({"message":"error al crear carrera"}),500
+    
+    else:
+        
+        return jsonify({"message":"carrera creada"}),200
+
+
+def crearAsignatura(request):
+    
+    try:
+        
+        json_req = request.json
+        fac_nombre = json_req['fac_nombre']
+        car_nombre= json_req['car_nombre']
+        asig_identificador=json_req['asig_identificador']
+        
+        path=('resources/'+fac_nombre+'/'+car_nombre+'/'+asig_identificador+"/")
+        
+        if not os.path.isdir(path):
+            os.makedirs(path)
+        
+        os.mkdir(path+'Portafolios')
+        
+    except OSError as e:
+        print(e.strerror)
+        return jsonify({"message":"error al crear asignatura"}),500
+
+    else:
+        
+        return jsonify({"message":"asignatura creada"}),200
+    
+
+def crearPortafolio(request):
+    
+    try:
+        
+        json_req = request.json
+        fac_nombre = json_req['fac_nombre']
+        car_nombre= json_req['car_nombre']
+        asig_identificador=json_req['asig_identificador']
+        per_cedula=json_req['per_cedula']
+        
+        pathCedula=('resources/'+fac_nombre+'/'+car_nombre+'/'+asig_identificador+'/Portafolios/'+per_cedula)
+        os.mkdir(pathCedula)
+        
+        pathDatosInf=(pathCedula+'/1. Datos informativos')
+        os.mkdir(pathDatosInf)
+        
+        pathElmentosCurri=(pathCedula+'/2. Elementos curriculares')
+        os.mkdir(pathElmentosCurri)
+        os.mkdir(pathElmentosCurri+'/a. Syllabus')
+        os.mkdir(pathElmentosCurri+'/b. Expectativas')
+        os.mkdir(pathElmentosCurri+'/c. Apuntes de clase')
+        os.mkdir(pathElmentosCurri+'/d. Evaluaciones')
+        os.mkdir(pathElmentosCurri+'/e. Investigaciones')
+        os.mkdir(pathElmentosCurri+'/f. Actividades de experimentación')
+        os.mkdir(pathElmentosCurri+'/g. Proyectos')
+        os.mkdir(pathElmentosCurri+'/h. Estudios de caso')
+        os.mkdir(pathElmentosCurri+'/i. Planteamiento de problemas')
+        os.mkdir(pathElmentosCurri+'/j. Registro de asistencia')
+        os.mkdir(pathElmentosCurri+'/k. Registro de observaciones')
+        os.mkdir(pathElmentosCurri+'/l. Tareas intraclases')
+        os.mkdir(pathElmentosCurri+'/m. Tareas autónomas')
+        os.mkdir(pathElmentosCurri+'/n. Tareas de refuerzo')
+        
+        pathInformeFin=(pathCedula+'/3. Informe final')
+        os.mkdir(pathInformeFin)
+    
+    except OSError as error:
+        print(error)
+        return jsonify({"message":"error al crear portafolio"}),500
+    else:
+        return jsonify({"message":"portafolio creado"}),200  
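+
+# Illustrative example (an assumption about the expected request body, inferred from the keys read
+# above; the values are placeholders): crearPortafolio expects JSON such as
+#   {"fac_nombre": "Ingenieria", "car_nombre": "Sistemas",
+#    "asig_identificador": "ASG01", "per_cedula": "0102030405"}
+# and builds the numbered portfolio folders under
+# resources/<fac_nombre>/<car_nombre>/<asig_identificador>/Portafolios/<per_cedula>/.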

+ 41 - 0
Directory/file_handler.py

@@ -0,0 +1,41 @@
+import os
+import time
+import traceback
+
+def file_storage(file_path,suffix):
+    r"""
+        file_path :: absolute base directory to store files under
+        suffix :: filename to place inside the dated folder
+
+        Example: file_path = C:\Users\Desktop\video, suffix = abc.py
+        returns C:\Users\Desktop\video\2020\12\12\abc.py (for 2020-12-12),
+        creating the year/month/day folders if they do not exist
+    """
+    tm = time.localtime(time.time())
+    # Get the current system year, month and day
+    year = time.strftime('%Y', tm)
+    month = time.strftime('%m', tm)
+    day = time.strftime('%d', tm)
+    # Build the year/month/day folder paths for the current date
+    file_year = file_path + '/' + year
+    file_month = file_year + '/' + month
+    file_day = file_month + '/' + day
+    # Check whether each path exists and create any that are missing
+    if not os.path.exists(file_path):
+        os.makedirs(file_path)
+        os.mkdir(file_year)
+        os.mkdir(file_month)
+        os.mkdir(file_day)
+    else:
+        if not os.path.exists(file_year):
+            os.mkdir(file_year)
+            os.mkdir(file_month)
+            os.mkdir(file_day)
+        else:
+            if not os.path.exists(file_month):
+                os.mkdir(file_month)
+                os.mkdir(file_day)
+            else:
+                if not os.path.exists(file_day):
+                    os.mkdir(file_day)
+    return os.path.join(file_day,suffix)
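+
+# Example usage (illustrative; the base path is a placeholder, not a value from the original project):
+#
+#   saved_path = file_storage('C:/Users/Desktop/video', 'abc.py')
+#   # -> C:/Users/Desktop/video/<year>/<month>/<day>/abc.py, with the folders created on first use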

+ 130 - 0
Directory/generate_directories.py

@@ -0,0 +1,130 @@
+"""
+Taken from - https://github.com/alexhamiltonRN
+"""
+from pathlib import Path
+
+def generate_patient_ids(dataset_type):
+    """
+    This function generates the patient_ids for the directories to be created below. 
+    Ids are extracted from the raw dataset file structure.
+    """
+    
+    patient_ids = []
+    path_to_data = Path()
+    
+    if dataset_type == str(1):
+        path_to_data = Path('E:/Memoire/ProstateX/train-data')
+    else:
+        path_to_data = Path('E:/Memoire/ProstateX/test-data')
+    
+    # Get list of patient_ids in folder
+    patient_folders = [x for x in path_to_data.iterdir() if x.is_dir()]
+    for patient_folder in patient_folders:
+        patient_ids.append(str(patient_folder.stem))
+    return patient_ids 
+
+def generate_nifti_ds(patient_ids, dataset_type):
+    """
+    This function generates the directory structure for the nifti files
+    generated from the dicom files.
+
+    Directory structure for generated data:
+    ProstateX/generated/train/nifti
+    ProstateX/generated/test/nifti
+    """
+    for patient_id in patient_ids:
+        if dataset_type == str(1):
+            new_path = Path(str('E:/Memoire/ProstateX/generated/train/nifti/' + patient_id))
+            new_path.mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('t2').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('bval').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('adc').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('ktrans').mkdir(parents = True, exist_ok = True)
+
+        else:
+            new_path = Path(str('E:/Memoire/ProstateX/generated/test/nifti/' + patient_id))
+            new_path.mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('t2').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('bval').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('adc').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('ktrans').mkdir(parents = True, exist_ok = True)
+
+def generate_nifti_resampled_ds(patient_ids, dataset_type):
+    """
+    This function generates the directory structure for the nifti files
+    generated from the dicom files.
+
+    Directory structure for generated data:
+    ProstateX/generated/train/nifti_resampled
+    ProstateX/generated/test/nifti_resampled
+    """
+    for patient_id in patient_ids:
+        if dataset_type == str(1):
+            new_path = Path(str('E:/Memoire/ProstateX/generated/train/nifti_resampled/' + patient_id))
+            new_path.mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('t2').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('bval').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('adc').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('ktrans').mkdir(parents = True, exist_ok = True)
+
+        else:
+            new_path = Path(str('E:/Memoire/ProstateX/generated/test/nifti_resampled/' + patient_id))
+            new_path.mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('t2').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('bval').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('adc').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('ktrans').mkdir(parents = True, exist_ok = True)
+
+def generate_numpy_ds(dataset_type):
+    """
+    This function generates the directory structure for the final numpy
+    arrays for the training and test sets. 
+    
+    Director structure for processed data:
+    ProstateX/generated/train/numpy
+    ProstateX/generated/test/numpy
+    """
+    if dataset_type == str(1):
+        new_path = Path('E:/Memoire/ProstateX/generated/train/numpy/')
+        new_path.mkdir(parents = True, exist_ok = True)
+        new_path.joinpath('t2').mkdir(parents = True, exist_ok = True)
+        new_path.joinpath('bval').mkdir(parents = True, exist_ok = True)
+        new_path.joinpath('adc').mkdir(parents = True, exist_ok = True)
+        new_path.joinpath('ktrans').mkdir(parents = True, exist_ok = True)
+    else:
+        new_path = Path('E:/Memoire/ProstateX/generated/test/numpy/')
+        new_path.mkdir(parents = True, exist_ok = True)
+        new_path.joinpath('t2').mkdir(parents = True, exist_ok = True)
+        new_path.joinpath('bval').mkdir(parents = True, exist_ok = True)
+        new_path.joinpath('adc').mkdir(parents = True, exist_ok = True)
+        new_path.joinpath('ktrans').mkdir(parents = True, exist_ok = True)
+        
+def generate_dataframe_ds(dataset_type):
+    if dataset_type == str(1):
+        new_path = Path('E:/Memoire/ProstateX/generated/train/dataframes/')
+        new_path.mkdir(parents = True, exist_ok = True)
+
+    else:
+        new_path = Path('E:/Memoire/ProstateX/generated/test/dataframes/')
+        new_path.mkdir(parents = True, exist_ok = True)
+
+def generate_logs_ds(dataset_type):
+    if dataset_type == str(1):
+        new_path = Path('E:/Memoire/ProstateX/generated/train/logs/')
+        new_path.mkdir(parents = True, exist_ok = True)
+
+    else:
+        new_path = Path('E:/Memoire/ProstateX/generated/test/logs/')
+        new_path.mkdir(parents = True, exist_ok = True)
+
+def main():
+    dataset_type = input('Generate directory structure for which type of data (1-Train; 2-Test):')
+    patient_ids = generate_patient_ids(dataset_type)
+    generate_nifti_ds(patient_ids, dataset_type)
+    generate_nifti_resampled_ds(patient_ids, dataset_type)
+    generate_numpy_ds(dataset_type)
+    generate_dataframe_ds(dataset_type)
+    generate_logs_ds(dataset_type)
+    print('Done creating directory structure...')
+
+main()

+ 167 - 0
Directory/logging.py

@@ -0,0 +1,167 @@
+import logging
+import os
+import traceback
+
+from datetime import datetime
+from django.conf import settings
+from django.core.files import File
+
+
+def set():
+    if not os.path.exists(settings.MEDIA_ROOT):
+        try:
+            os.mkdir(settings.MEDIA_ROOT)
+        except OSError:
+            return
+
+    if not os.path.exists(settings.MEDIA_ROOT+'/download'):
+        try:
+            os.mkdir(settings.MEDIA_ROOT+'/download')
+        except OSError:
+            return
+
+    if not os.path.exists(settings.BASE_DIR + "/log"):
+        try:
+            os.mkdir(settings.BASE_DIR + "/log")
+        except OSError:
+            return
+    if not os.path.exists(settings.BASE_DIR + "/log/message"):
+        try:
+            os.mkdir(settings.BASE_DIR + "/log/message")
+        except OSError:
+            return
+    if not os.path.exists(settings.BASE_DIR + "/log/error"):
+        try:
+            os.mkdir(settings.BASE_DIR + "/log/error")
+        except OSError:
+            return
+    if not os.path.exists(settings.BASE_DIR + "/log/log"):
+        try:
+            os.mkdir(settings.BASE_DIR + "/log/log")
+        except OSError:
+            return
+    if not os.path.exists(settings.MEDIA_ROOT + "/tgbot"):
+        try:
+            os.mkdir(settings.MEDIA_ROOT + "/tgbot")
+        except OSError:
+            return
+
+
+
+def message(message):
+    DirLogs = settings.BASE_DIR + "/log"
+    if not os.path.exists(DirLogs):
+        try:
+            os.mkdir(DirLogs)
+        except OSError:
+            return
+    DirLogs = settings.BASE_DIR + "/log/message"
+    if not os.path.exists(DirLogs):
+        try:
+            os.mkdir(DirLogs)
+        except OSError:
+            return
+    date = datetime.now()
+    month = "0" if date.month < 10 else ""
+    month += str(date.month)
+    day = "0" if date.day < 10 else ""
+    day += str(date.day)
+    StrDate = "%s%s%s" % (str(date.year), month, day)
+    file = open(DirLogs + '/message_' + StrDate + '.log', 'a')
+    my_file = File(file)
+    my_file.write("[%s]: %s\n" % (
+        str(datetime.now().strftime("%d-%m-%Y %H:%M:%S")),
+        str(message)
+        ))
+    my_file.close()
+    file.close()
+
+
+def log(message):
+    DirLogs = settings.BASE_DIR + "/log"
+    if not os.path.exists(DirLogs):
+        try:
+            os.mkdir(DirLogs)
+        except OSError:
+            return
+    DirLogs = settings.BASE_DIR + "/log/log"
+    if not os.path.exists(DirLogs):
+        try:
+            os.mkdir(DirLogs)
+        except OSError:
+            return
+    date = datetime.now()
+    month = "0" if date.month < 10 else ""
+    month += str(date.month)
+    day = "0" if date.day < 10 else ""
+    day += str(date.day)
+    StrDate = "%s%s%s" % (str(date.year), month, day)
+    file = open(DirLogs + '/message_' + StrDate + '.log', 'a')
+    my_file = File(file)
+    my_file.write("[%s]: %s\n" % (
+        str(datetime.now().strftime("%d-%m-%Y %H:%M:%S")),
+        str(message)))
+    my_file.close()
+    file.close()
+
+
+def error(message):
+    DirLogs = settings.BASE_DIR + "/log"
+    if not os.path.exists(DirLogs):
+        try:
+            os.mkdir(DirLogs)
+        except OSError:
+            return
+    DirLogs = settings.BASE_DIR + "/log/error"
+    if not os.path.exists(DirLogs):
+        try:
+            os.mkdir(DirLogs)
+        except OSError:
+            return
+    date = datetime.now()
+    month = "0" if date.month < 10 else ""
+    month += str(date.month)
+    day = "0" if date.day < 10 else ""
+    day += str(date.day)
+    StrDate = "%s%s%s" % (str(date.year), month, day)
+    file = open(DirLogs + '/errors_' + StrDate + '.log', 'a')
+    my_file = File(file)
+    my_file.write("[%s]: %s\n" % (
+        str(datetime.now().strftime("%d-%m-%Y %H:%M:%S")),
+        str(message)))
+    my_file.close()
+    file.close()
+
+
+def check_dir():
+    try:
+        if not os.path.exists(settings.MEDIA_ROOT):
+            try:
+                os.mkdir(settings.MEDIA_ROOT)
+            except OSError:
+                logging.error(traceback.format_exc())
+                return
+        if not os.path.exists(settings.MEDIA_ROOT+"/att"):
+            try:
+                os.mkdir(settings.MEDIA_ROOT+"/att")
+            except OSError:
+                logging.error(traceback.format_exc())
+                return
+        if not os.path.exists(settings.MEDIA_ROOT+"/att/biophoto"):
+            try:
+                os.mkdir(settings.MEDIA_ROOT+"/att/biophoto")
+            except OSError:
+                logging.error(traceback.format_exc())
+                return
+        if not os.path.exists(settings.ATT_ROOT):
+            try:
+                os.mkdir(settings.ATT_ROOT)
+            except OSError:
+                logging.error(traceback.format_exc())
+                return
+        if not os.path.exists(settings.ATT_ROOT+"/USERPIC"):
+            try:
+                os.mkdir(settings.ATT_ROOT+"/USERPIC")
+            except OSError:
+                logging.error(traceback.format_exc())
+                return
+    except Exception as err:
+        logging.error('%s\n%s' % (traceback.format_exc(), str(err)))
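+
+# Typical usage (illustrative): call set() once at startup to create the media and log folders,
+# then append timestamped entries to the dated files under BASE_DIR/log, e.g.
+#
+#   set()
+#   message("bot started")
+#   error("something went wrong")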

+ 27 - 0
Directory/make_folder.py

@@ -0,0 +1,27 @@
+import os
+def make_folder(dealername):
+    os.getcwd()
+    #'C:\\Users\\corcoras\\Desktop\\FY14 INSTALLS'
+    install_dir = 'C:\\Users\\corcoras\\Desktop\\FY14 INSTALLS'
+    os.chdir(install_dir)
+    #dealername = "Rene motors"
+    dealername_no_space = dealername.replace(" ", "_")
+    dealername_no_space
+    #'Don_Ayres_Honda'
+    dealer_folder = dealername_no_space[:1]
+    dealer_folder
+    #'D'
+    os.chdir(dealer_folder)
+    os.getcwd()
+    #'C:\\Users\\corcoras\\Desktop\\FY14 INSTALLS\\D'
+    dealername_spaces = dealername_no_space.replace("_", " ")
+    dealername_spaces
+    #'Don Ayres Honda'
+    os.mkdir(dealername_spaces)
+    os.chdir(dealername_spaces)
+    os.getcwd()
+    #'C:\\Users\\corcoras\\Desktop\\FY14 INSTALLS\\D\\Don Ayres Honda'
+    os.mkdir("config")
+    os.mkdir("original")
+    os.mkdir("final")
+    print(f"\nFolder was created : {install_dir}\\{dealer_folder}\\{dealername_spaces}")
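+
+# Example usage (illustrative; the dealer name is taken from the comments above):
+#
+#   make_folder("Don Ayres Honda")
+#   # -> C:\Users\corcoras\Desktop\FY14 INSTALLS\D\Don Ayres Honda with config/original/final inside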

+ 90 - 0
Directory/mkdir.py

@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+
+
+"""
+Pydir is mkdir for Python modules.
+
+Example:
+    $ pydir -v myproject/module/etc
+    Created directory myproject/module/etc
+    Created file myproject/__init__.py
+    Created file myproject/module/__init__.py
+    Created file myproject/module/etc/__init__.py
+"""
+
+
+from optparse import OptionParser, make_option
+import os
+import os.path
+import sys
+
+
+VERSION = (0, 2, 1)
+
+
+def version_string():
+    return '.'.join(str(component) for component in VERSION)
+
+
+def main():
+    usage = '%prog path [path2] [path3] [pathN]\n\n' + __doc__.strip()
+    parser = OptionParser(usage=usage, option_list=(
+        make_option('-v', '--verbose', default=False, action='store_true'),
+    ))
+    
+    options, args = parser.parse_args()
+    
+    if len(args) == 0:
+        parser.error('No paths given.')
+    
+    output = sys.stdout if options.verbose else None
+    
+    for index, path in enumerate(args):
+        path = path.replace('.', os.path.sep)
+        
+        if output and index > 0:
+            output.write('\n')
+        
+        try:
+            pydir(path, output=output)
+        except Exception as exc:
+            print('Couldn\'t create %s: %s' % (path, exc))
+
+
+def pydir(path, output=None):
+    """
+    Create a directory structure for a Python module, including __init__.py
+    files. Converts existing directories into modules.
+    """
+    
+    def info(line):
+        if output:
+            output.write(line)
+            output.write('\n')
+    
+    try:
+        os.makedirs(path)
+    except (OSError, IOError) as exc:
+        if os.path.isdir(path):
+            info('Path already exists: %s' % path)
+        else:
+            raise
+    else:
+        info('Created directory %s' % path)
+    
+    segments = path.split(os.path.sep)
+    for i in range(len(segments)):
+        init_filename = os.path.sep.join(segments[:i+1] + ['__init__.py'])
+        if not os.path.isfile(init_filename):
+            try:
+                open(init_filename, 'w').close()
+            except (OSError, IOError) as exc:
+                raise
+            else:
+                info('Created file %s' % (init_filename,))
+        else:
+            info('File already exists: %s' % (init_filename,))
+
+
+if __name__ == '__main__':
+    main()

+ 135 - 0
Directory/mkdirPypi.py

@@ -0,0 +1,135 @@
+
+
+                        #********************************************************************************#
+                        #                                                                                #
+                        #                                  нεℓℓσ,вαтεs!                                  #
+                        #                                                                                #
+                        #   filename: mkdirPypi.py                                                       #
+                        #   created: 2022-03-10                                                          #
+                        #   system: Windows                                                              #
+                        #   version: 64bit                                                               #
+                        #                                       by: Bates <https://github.com/batestin1> #
+                        #********************************************************************************#
+                        #                           import your librarys below                           #
+                        #********************************************************************************#
+
+from pathlib import Path
+from datetime import date
+import getpass
+import platform
+import subprocess
+
+def mkdirPypi(file):
+    users=getpass.getuser()
+    res = subprocess.run(["git", "config", "user.name"], stdout=subprocess.PIPE)
+    git_username = res.stdout.strip().decode()
+    filename = file.replace(' ', '_')
+    #create a home directory#
+    cd = 'Codigo fonte'
+    dw = 'Download'
+    linkGit = f'https://github.com/{git_username}/'
+    codigo_fonte = f"{cd} : {linkGit}"
+    download = f"{dw} : {linkGit}"
+    project_urls = {codigo_fonte, download}
+    path = Path(f"./{filename}")
+    path.mkdir(parents=True, exist_ok=True)
+    data_atual = date.today()
+    data = f"""{data_atual.strftime('%Y-%m-%d')}"""
+
+    #### create a LICENSE ####
+    textLic ="""
+MIT License
+Copyright (c) 2018 Yan Orestes
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge,publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+    """
+    with open(f"{filename}/LICENSE", "w") as licensa:
+        licensa.write(textLic)
+
+    #### create a README.md ###
+
+    textReadm = f"""
+<h1 align="center">
+<img src="https://img.shields.io/static/v1?label={filename.upper()}%20POR&message={users}&color=7159c1&style=flat-square&logo=ghost"/>
+<h3> <p align="center">{filename.upper()} </p> </h3>
+<h3> <p align="center"> ================= </p> </h3>
+>> <h3> Resume </h3>
+<p> text here </p>
+>> <h3> How install </h3>
+```
+code here
+```
+>> <h3> How Works </h3>
+```
+code here
+```
+    """
+    with open(f"{filename}/README.md", "w") as readme:
+        readme.write(textReadm)
+
+    ###setup.cfg###
+
+    cfgTxt = """
+[metadata]
+description-file = README.md
+license_file = LICENSE.txt
+"""
+    with open(f"{filename}/setup.cfg", "w") as cfgsetup:
+        cfgsetup.write(cfgTxt)
+
+    ###setup.py ######
+
+    setupyT = f"""
+from setuptools import setup
+setup(
+    name = '{filename}',
+    version = '1.0.0',
+    author = '{users}',
+    author_email = '{users}@mailer.com.br',
+    packages = ['{filename}'],
+    description = 'a way to make your life easier',
+    long_description = 'file: README.md',
+    url = 'https://github.com/{git_username}/',
+    project_urls = {project_urls},
+    keywords = 'a way to make your life easier',
+    classifiers = []
+)"""
+
+    with open(f"{filename}/setup.py", "w") as setupy:
+        setupy.write(setupyT)
+
+    #### create dir #####
+
+    path = Path(f"./{filename}/{filename}")
+    path.mkdir(parents=True, exist_ok=True)
+    txtnull=f"""
+#############################################################################################################################
+#   filename:{filename}.py                                                       
+#   created: {data}                                                              
+#   import your librarys below                                                    
+#############################################################################################################################
+
+
+def {filename}():
+    pass
+    """
+
+    with open(f"{filename}/{filename}/{filename}.py", "w") as main:
+        main.write(txtnull)
+
+
+    txtnull2=f"""
+#############################################################################################################################
+#   filename:{filename}.py                                                       
+#   created: {data}                                                              
+#   import your librarys below                                                    
+#############################################################################################################################
+
+
+
+from .{filename} import *
+
+    """
+    with open(f"{filename}/{filename}/__init__.py", "w") as init:
+        init.write(txtnull2)
+
+    print(f"your project call {filename} was create to be upper on Pypi")

+ 12 - 0
Directory/mkdir_p.py

@@ -0,0 +1,12 @@
+import os
+import errno
+
+
+def mkdir_p(path):
+    try:
+        os.makedirs(path)
+    except OSError as exc:
+        if exc.errno == errno.EEXIST and os.path.isdir(path):
+            pass
+        else:
+            raise  
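+
+# Note added for context: on Python 3.2+ this helper behaves essentially the same as the
+# standard library call
+#
+#   os.makedirs(path, exist_ok=True)
+#
+# which also ignores an existing directory but re-raises when the path exists as a regular file.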

+ 80 - 0
Directory/project_creator.py

@@ -0,0 +1,80 @@
+############################################################################
+##### Transposon Annotator reasonaTE - part of Transposon Ultimate #########
+##### Kevin Riehl (kevin.riehl.de@gmail.com, 2021) #########################
+############################################################################
+
+# Imports
+from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO
+from os import path
+import os.path
+
+# Methods
+def make_rc_record(record):
+    return SeqRecord(seq = record.seq.reverse_complement(), id = record.id, description="")
+
+def copySequenceClean(fromFile,projectFolderPath):
+    # Copy sequence and clean heads
+    f1 = open(fromFile,"r")
+    f2 = open(os.path.join(projectFolderPath,"sequence.fasta"),"w+")
+    f3 = open(os.path.join(projectFolderPath,"sequence_heads.txt"),"w+")
+    line = f1.readline()
+    counter = 0
+    while line!="":
+        if(line.startswith(">")):
+            counter += 1
+            f3.write(">seq"+str(counter)+"\t"+line)
+            f2.write(">seq"+str(counter)+"\n")
+        else:
+            f2.write(line.upper())
+        line = f1.readline()
+    f1.close()
+    f2.close()
+    f3.close()
+    # Create reverse complement Fasta file
+    records = map(make_rc_record, SeqIO.parse(os.path.join(projectFolderPath,"sequence.fasta"), "fasta"))
+    SeqIO.write(records, os.path.join(projectFolderPath,"sequence_rc.fasta"), "fasta")
+    records = map(make_rc_record, SeqIO.parse(os.path.join(projectFolderPath,"sequence_rc.fasta"), "fasta"))
+    SeqIO.write(records, os.path.join(projectFolderPath,"sequence.fasta"), "fasta")
+    
+def createProject(projectFolder, projectName, inputFasta):
+    # Check if project folder exists
+    if(not path.isdir(projectFolder)):
+        os.mkdir(projectFolder)    
+    # Check if given project already exits
+    projectFolderPath = os.path.join(projectFolder,projectName)
+    if(path.isdir(projectFolderPath)):
+        print("Project already exists, process aborted")
+        return "EXIT"
+    os.mkdir(projectFolderPath)
+    # Create folder structure for annotation softwares
+    os.mkdir(os.path.join(projectFolderPath,"tirvish"))
+    os.mkdir(os.path.join(projectFolderPath,"tirvish_rc"))
+    os.mkdir(os.path.join(projectFolderPath,"sinescan"))
+    os.mkdir(os.path.join(projectFolderPath,"sinefind"))
+    os.mkdir(os.path.join(projectFolderPath,"sinefind_rc"))
+    os.mkdir(os.path.join(projectFolderPath,"repMasker"))
+    os.mkdir(os.path.join(projectFolderPath,"repeatmodel"))
+    os.mkdir(os.path.join(projectFolderPath,"must"))
+    os.mkdir(os.path.join(projectFolderPath,"mitetracker"))
+    os.mkdir(os.path.join(projectFolderPath,"mitetracker_rc"))
+    os.mkdir(os.path.join(projectFolderPath,"mitefind"))
+    os.mkdir(os.path.join(projectFolderPath,"mitefind_rc"))
+    os.mkdir(os.path.join(projectFolderPath,"ltrPred"))
+    os.mkdir(os.path.join(projectFolderPath,"ltrHarvest"))
+    os.mkdir(os.path.join(projectFolderPath,"helitronScanner"))
+    os.mkdir(os.path.join(projectFolderPath,"helitronScanner_rc")) 
+    os.mkdir(os.path.join(projectFolderPath,"transposonPSI")) 
+    os.mkdir(os.path.join(projectFolderPath,"NCBICDD1000")) 
+    os.mkdir(os.path.join(projectFolderPath,"parsedAnnotations")) 
+    os.mkdir(os.path.join(projectFolderPath,"transposonCandA")) 
+    os.mkdir(os.path.join(projectFolderPath,"transposonCandB")) 
+    os.mkdir(os.path.join(projectFolderPath,"transposonCandC")) 
+    os.mkdir(os.path.join(projectFolderPath,"transposonCandD")) 
+    os.mkdir(os.path.join(projectFolderPath,"transposonCandE")) 
+    os.mkdir(os.path.join(projectFolderPath,"transposonCandF")) 
+    os.mkdir(os.path.join(projectFolderPath,"finalResults")) 
+    # Copy DNA into folder
+    copySequenceClean(inputFasta,projectFolderPath)
+
+#createProject("projects", "testProject", "G:/CambridgeGenData/GenSeq/RHIZIPHAGUS_IRR/rir17contigs.fasta")

+ 206 - 0
Directory/setup.py

@@ -0,0 +1,206 @@
+import os
+from pathlib import Path
+import shutil
+import glob
+
+def setup_folders(num_vincs=6, num_sites=6):
+    """
+    DESCRIPTION:
+    Sets up directory structure for storing plotfiles.
+    
+    
+    CALLING SEQUENCE: 
+    setup_folders(num_vincs=6, num_sites=6)
+    
+    KEYWORDS:
+    ## num_vincs: number of velocity increments (default 6; +0-5 km/s)
+    ## num_sites: number of specific collision sites (default 6)
+    
+    
+    Directory Structure:
+    Plots
+        - all_ejecta
+            - vincs_separate
+                - 0vinc
+                    - all_planets
+                    - per_planet
+                        - cols_v_time
+                        - cols_v_time_fits
+                        - inc_v_a
+                        - e_v_a
+                - 1vinc
+                - 2vinc
+                  ...
+                  ...
+            - vincs_compared
+                - histograms
+                - cols_v_time
+                - inc_v_a
+                - e_v_a
+                
+        - specific_collision_sites
+            - site1
+                - vincs_separate
+                    - 0vinc
+                        - all_planets
+                        - per_planet
+                            - cols_v_time
+                            - cols_v_time_fits
+                            - inc_v_a
+                            - e_v_a
+                    - 1vinc
+                    - 2vinc
+                      ...
+                      ...
+                - vincs_compared
+                    - histograms
+                    - cols_v_time
+                    - inc_v_a
+                    - e_v_a
+            - site2
+              ...
+              ...
+              
+        - single_ejecta
+            - 0vinc
+            - 1vinc
+              ...
+              ...
+    
+    """
+    
+    object_names = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
+    
+    parent = os.getcwd()
+    plotspath = parent + "/Plots"
+    all_ejecta_path = plotspath + "/all_ejecta"
+    specific_sites_path = plotspath + "/specific_collision_sites"
+    single_ejecta_path = plotspath + "/single_ejecta"
+    
+    #create Plots directory
+    Path(plotspath).mkdir(parents=True, exist_ok=True)
+    
+    
+    
+    #create all_ejecta folder
+    Path(all_ejecta_path).mkdir(parents=True, exist_ok=True)
+     
+    #populate all_ejecta_folder:
+    
+    ###1. vincs_separate folder
+    Path(all_ejecta_path + "/vincs_separate").mkdir(parents=True, exist_ok=True)
+    for i in range(num_vincs):
+        
+        #make vincs_separate
+        vinc_folder = all_ejecta_path + "/vincs_separate/" + str(i) + "vinc"
+        Path(vinc_folder).mkdir(parents=True, exist_ok=True)
+        
+        #make all_planets
+        Path(vinc_folder + "/all_planets").mkdir(parents=True, exist_ok=True)
+        Path(vinc_folder + "/all_planets/inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        Path(vinc_folder + "/all_planets/e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        
+        #make and populate per_planet
+        per_p_folder = vinc_folder + "/per_planet"
+        Path(per_p_folder).mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/cols_v_time").mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/cols_v_time_fits").mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/inc_v_a").mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/e_v_a").mkdir(parents=True, exist_ok=True)
+        for o in object_names[1:]:
+            Path(per_p_folder + "/inc_v_a/" + o + "_inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/e_v_a/" + o + "_e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        
+        Path(per_p_folder + "/inc_v_a/remaining_inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/e_v_a/remaining_e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/inc_v_a/esc_inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/e_v_a/esc_e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/inc_v_a/mixed_inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/e_v_a/mixed_e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+    
+    
+    ###2. vincs_compared folder
+    Path(all_ejecta_path + "/vincs_compared").mkdir(parents=True, exist_ok=True)
+    Path(all_ejecta_path + "/vincs_compared/histograms").mkdir(parents=True, exist_ok=True)
+    Path(all_ejecta_path + "/vincs_compared/cols_v_time").mkdir(parents=True, exist_ok=True)
+    Path(all_ejecta_path + "/vincs_compared/inc_v_a").mkdir(parents=True, exist_ok=True)
+    Path(all_ejecta_path + "/vincs_compared/e_v_a").mkdir(parents=True, exist_ok=True)
+    
+    
+    
+    #create specific_collision_sites folder
+    Path(specific_sites_path).mkdir(parents=True, exist_ok=True)
+    
+    #populate specific_collision_sites folder
+    for j in range(num_sites):
+        
+        #folder for each site
+        site_path = specific_sites_path + "/site" + str(j) 
+        Path(site_path).mkdir(parents=True, exist_ok=True)
+        
+        #1. vincs_separate folder
+        for i in range(num_vincs):
+        
+            #make vincs_separate
+            vinc_folder = site_path + "/vincs_separate/" + str(i) + "vinc"
+            Path(vinc_folder).mkdir(parents=True, exist_ok=True)
+
+            #make all_planets
+            Path(vinc_folder + "/all_planets").mkdir(parents=True, exist_ok=True)
+            Path(vinc_folder + "/all_planets/inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            Path(vinc_folder + "/all_planets/e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            
+            #make and populate per_planet
+            per_p_folder = vinc_folder + "/per_planet"
+            Path(per_p_folder).mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/cols_v_time").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/cols_v_time_fits").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/inc_v_a").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/e_v_a").mkdir(parents=True, exist_ok=True)
+            for o in object_names[1:]:
+                Path(per_p_folder + "/inc_v_a/" + o + "_inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+                Path(per_p_folder + "/e_v_a/" + o + "_e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/inc_v_a/remaining_inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/e_v_a/remaining_e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/inc_v_a/esc_inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/e_v_a/esc_e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/inc_v_a/mixed_inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/e_v_a/mixed_e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        
+        ###2. vincs_compared folder
+        Path(site_path + "/vincs_compared").mkdir(parents=True, exist_ok=True)
+        Path(site_path + "/vincs_compared/histograms").mkdir(parents=True, exist_ok=True)
+        Path(site_path + "/vincs_compared/cols_v_time").mkdir(parents=True, exist_ok=True)
+        Path(site_path + "/vincs_compared/inc_v_a").mkdir(parents=True, exist_ok=True)
+        Path(site_path + "/vincs_compared/e_v_a").mkdir(parents=True, exist_ok=True)
+        
+    
+    #create single_ejecta_path folder
+    Path(single_ejecta_path).mkdir(parents=True, exist_ok=True)
+    #populate
+    for i in range(num_vincs):
+        Path(single_ejecta_path + '/' + str(i) + 'vinc').mkdir(parents=True, exist_ok=True)
+        
+        
+def sort_data(num_vincs=6):
+    """
+    DESCRIPTION:
+    Sorts data folders in Ejecta_Simulation_Data by vinc.
+    
+    CALLING SEQUENCE:
+    sort_data(num_vincs=6)
+    
+    KEYWORDS:
+    ## num_vincs: number of velocity increments (default 6; +0-5 km/s)
+    """
+    
+    parent = os.getcwd()
+    folders = sorted(glob.glob(parent + '/Ejecta_Simulation_Data/5000e*'))
+    for i in range(num_vincs):
+        Path(parent + '/Ejecta_Simulation_Data/'+str(i)+'vinc').mkdir(parents=True, exist_ok=True)
+    for folder in folders:
+        vincnum = folder.split('/')[-1].split('_')[2][0]
+        shutil.move(folder, parent + '/Ejecta_Simulation_Data/' + str(vincnum) + 'vinc')
+    
+    
+    
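+
+# Example usage (illustrative): run from the directory that contains (or should contain) Plots/
+# and Ejecta_Simulation_Data/.
+#
+#   setup_folders(num_vincs=6, num_sites=6)   # build the Plots/ tree described in the docstring
+#   sort_data(num_vincs=6)                    # group the 5000e* data folders by velocity increment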

+ 49 - 0
Directory/split_data_in_k_folds.py

@@ -0,0 +1,49 @@
+import os
+import shutil
+
+
+def populate_kfold_directories(data_dir, K_FOLDS):
+
+    alarmed_images = os.listdir(f"{data_dir}/Alarmed")
+    annoyed_images = os.listdir(f"{data_dir}/Annoyed")
+    curious_images = os.listdir(f"{data_dir}/Curious")
+    relaxed_images = os.listdir(f"{data_dir}/Relaxed")
+
+    for i in range(K_FOLDS):
+        validation_range = (i*20, i*20 + 20)
+
+        for j in range(0, 100):
+            if validation_range[0] <= j < validation_range[1]:
+                shutil.copy(f"{data_dir}/Alarmed/{alarmed_images[j]}", f"folds/fold{i}/validation/Alarmed/")
+                shutil.copy(f"{data_dir}/Annoyed/{annoyed_images[j]}", f"folds/fold{i}/validation/Annoyed/")
+                shutil.copy(f"{data_dir}/Curious/{curious_images[j]}", f"folds/fold{i}/validation/Curious/")
+                shutil.copy(f"{data_dir}/Relaxed/{relaxed_images[j]}", f"folds/fold{i}/validation/Relaxed/")
+            else:
+                shutil.copy(f"{data_dir}/Alarmed/{alarmed_images[j]}", f"folds/fold{i}/train/Alarmed/")
+                shutil.copy(f"{data_dir}/Annoyed/{annoyed_images[j]}", f"folds/fold{i}/train/Annoyed/")
+                shutil.copy(f"{data_dir}/Curious/{curious_images[j]}", f"folds/fold{i}/train/Curious/")
+                shutil.copy(f"{data_dir}/Relaxed/{relaxed_images[j]}", f"folds/fold{i}/train/Relaxed/")
+
+
+def create_kfold_directories(K_FOLDS):
+
+    try:
+        os.mkdir("folds")
+    except FileExistsError:
+        print("Directory 'folds' already exists")
+
+    for i in range(K_FOLDS):
+        try:
+            os.mkdir(f"folds/fold{i}/")
+            os.mkdir(f"folds/fold{i}/train")
+            os.mkdir(f"folds/fold{i}/validation")
+            os.mkdir(f"folds/fold{i}/train/Alarmed")
+            os.mkdir(f"folds/fold{i}/train/Annoyed")
+            os.mkdir(f"folds/fold{i}/train/Curious")
+            os.mkdir(f"folds/fold{i}/train/Relaxed")
+            os.mkdir(f"folds/fold{i}/validation/Alarmed")
+            os.mkdir(f"folds/fold{i}/validation/Annoyed")
+            os.mkdir(f"folds/fold{i}/validation/Curious")
+            os.mkdir(f"folds/fold{i}/validation/Relaxed")
+        except FileExistsError:
+            print("Can't create directory because it already exists")
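+
+
+# Example usage (illustrative; "data" is a placeholder for a folder with Alarmed/, Annoyed/,
+# Curious/ and Relaxed/ subfolders of at least 100 images each, matching the hard-coded range above):
+#
+#   K_FOLDS = 5
+#   create_kfold_directories(K_FOLDS)
+#   populate_kfold_directories("data", K_FOLDS)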

+ 80 - 0
Directory/stc_vid2frames.py

@@ -0,0 +1,80 @@
+import sys
+import os
+import numpy as np
+import shutil
+import argparse
+import torch
+import torchvision
+from tqdm import tqdm
+
+def main():
+    parser = argparse.ArgumentParser(add_help=True)
+    parser.add_argument('--dataroot',
+                        default='.',
+                        help='Dataset root directory')
+    parser.add_argument('--src_vid_path', default='archive/training/videos/',
+                        help='Name of folder where `avi` files exist')
+    parser.add_argument('--tar_vid_frame_path', default='converted/train',
+                        help='Name of folder to save extracted frames.')
+    parser.add_argument('--src_npy_path', default='archive/test_pixel_mask/',
+                        help='Name of folder where `npy` frame mask exist')
+    parser.add_argument('--tar_anno_path', default='converted/pixel_mask',
+                        help='Name of folder to save extracted frame annotation')
+    parser.add_argument('--extension', default='jpg',
+                        help="File extension format for the output image")
+
+    args = parser.parse_args()
+
+    src_dir = os.path.join(args.dataroot, args.src_vid_path)
+    tar_dir = os.path.join(args.dataroot, args.tar_vid_frame_path)
+
+    try:
+        os.makedirs(tar_dir)
+    except FileExistsError:
+        print(F'{tar_dir} already exists, remove whole tree and recompose ...')
+        shutil.rmtree(tar_dir)
+        os.makedirs(tar_dir)
+
+    vid_list = os.listdir(src_dir)
+
+    for i, vidname in enumerate(tqdm(vid_list)):
+        vid = torchvision.io.read_video(os.path.join(src_dir, vidname), pts_unit='sec')[0]
+        target_folder = os.path.join(tar_dir, vidname[:-4])
+   
+        try: 
+            os.makedirs(target_folder)
+        except FileExistsError:
+            print(F'{target_folder} already exists, remove the directory recompose ...')
+            shutil.rmtree(target_folder)
+            os.makedirs(target_folder) 
+            
+        for i, frame in enumerate(vid):
+            frame = (frame / 255.).permute(2, 0, 1) #HWC2CHW
+            torchvision.utils.save_image(frame,
+                                         F'{target_folder}/{i:03}.{args.extension}') 
+    
+    src_dir = os.path.join(args.dataroot, args.src_npy_path)    
+    tar_dir = os.path.join(args.dataroot, args.tar_anno_path)
+
+    try:
+        os.makedirs(tar_dir)
+    except FileExistsError:
+        print(F"{tar_dir} already exists, remove whole tree and recompose ...")
+        shutil.rmtree(tar_dir)
+        os.makedirs(tar_dir)
+
+    frame_anno = os.listdir(src_dir)
+
+    for _f in tqdm(frame_anno):
+        fn = _f[:-4]
+        target_folder = os.path.join(tar_dir, fn)
+        os.makedirs(target_folder)
+        px_anno = np.load(F"{src_dir}/{fn}.npy").astype(float)
+
+        for i, px_frame in enumerate(px_anno):
+            torchvision.utils.save_image(torch.from_numpy(px_frame).unsqueeze(0), # CHW, 1 channel
+                                         F"{target_folder}/{i:03}.{args.extension}")
+
+
+if __name__ == '__main__':
+    main()
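+
+# Example invocation (illustrative; --dataroot is a placeholder for the dataset root that contains
+# the default archive/ folders referenced above):
+#
+#   python stc_vid2frames.py --dataroot /data/stc --extension jpg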

+ 197 - 0
Directory/test_archive.py

@@ -0,0 +1,197 @@
+## Copyright (c) 2012 Aldebaran Robotics. All rights reserved.
+## Use of this source code is governed by a BSD-style license that can be
+## found in the COPYING file.
+
+"""Automatic testing for handling archives
+
+"""
+
+import os
+import sys
+import stat
+import errno
+import unittest
+import tempfile
+
+import qibuild
+
+class ArchiveTestCase(unittest.TestCase):
+    def setUp(self):
+        self.tmp = tempfile.mkdtemp(prefix="tmp-archive-test")
+
+    def tearDown(self):
+        qibuild.sh.rm(self.tmp)
+
+    def test_zip_extract(self):
+        # Create some files in the temp dir:
+        src = os.path.join(self.tmp, "src")
+        os.mkdir(src)
+        # Create a dir called a containing two files
+        # named b and c
+        a = os.path.join(src, "a")
+        os.mkdir(a)
+        b = os.path.join(a, "b")
+        with open(b, "w") as fp:
+            fp.write("b\n")
+        c = os.path.join(a, "c")
+        with open(c, "w") as fp:
+            fp.write("c\n")
+        archive = qibuild.archive.zip(a)
+        dest = os.path.join(self.tmp, "dest")
+        os.mkdir(dest)
+        qibuild.archive.extract(archive, dest)
+        ls_r = qibuild.sh.ls_r(dest)
+        self.assertEquals(ls_r, ["a/b", "a/c"])
+
+    def test_zip_extract_ro(self):
+        src = os.path.join(self.tmp, "src")
+        os.mkdir(src)
+        # Create a dir called a containing a single
+        # read-only file named ro
+        a = os.path.join(src, "a")
+        os.mkdir(a)
+        ro = os.path.join(a, "ro")
+        with open(ro, "w") as fp:
+            fp.write("ro\n")
+        # 0400: read-only for the owner
+        os.chmod(ro, stat.S_IRUSR)
+        archive = qibuild.archive.zip(a)
+        dest = os.path.join(self.tmp, "dest")
+        os.mkdir(dest)
+        qibuild.archive.extract(archive, dest)
+        ls_r = qibuild.sh.ls_r(dest)
+        self.assertEquals(ls_r, ["a/ro"])
+        dest_ro = os.path.join(dest, "a", "ro")
+        # check that the dest is readonly:
+        error = None
+        try:
+            open(dest_ro, "w")
+        except IOError as e:
+            error = e
+        self.assertFalse(error is None)
+        self.assertEquals(error.errno,  errno.EACCES)
+
+    def test_zip_extract_ro_dir(self):
+        src = os.path.join(self.tmp, "src")
+        os.mkdir(src)
+        ro1 = os.path.join(src, "ro1")
+        os.mkdir(ro1)
+        ro2 = os.path.join(ro1, "ro2")
+        os.mkdir(ro2)
+        a = os.path.join(ro2, "a")
+        with open(a, "w") as fp:
+            fp.write("a\n")
+        # RO dir inside another RO dir
+        os.chmod(ro2, stat.S_IRUSR | stat.S_IXUSR)
+        os.chmod(ro1, stat.S_IRUSR | stat.S_IXUSR)
+        archive = qibuild.archive.zip(src)
+        dest = os.path.join(self.tmp, "dest")
+        os.mkdir(dest)
+        qibuild.archive.extract(archive, dest)
+        ls_r = qibuild.sh.ls_r(dest)
+        self.assertEquals(ls_r, ["src/ro1/ro2/a"])
+
+    def test_extract_preserve_executables_from_zip(self):
+        zip = qibuild.command.find_program("zip")
+        if not zip:
+            return
+        src = os.path.join(self.tmp, "src")
+        os.mkdir(src)
+        a_exe = os.path.join(src, "a.exe")
+        with open(a_exe, "w") as fp:
+            fp.write("a_exe\n")
+        st_700 = stat.S_IXUSR | stat.S_IRUSR | stat.S_IWUSR
+        os.chmod(a_exe, st_700)
+        qibuild.command.call(["zip", "-r", "src.zip", "src"],
+            cwd=self.tmp)
+        archive = os.path.join(self.tmp, "src.zip")
+        dest = os.path.join(self.tmp, "dest")
+        os.mkdir(dest)
+        qibuild.archive.extract_zip(archive, dest)
+        dest_exe = os.path.join(dest, "src", "a.exe")
+        st_mode = os.stat(dest_exe).st_mode
+        self.assertEquals(st_mode, 0o100700)  # regular file with 0700 permissions
+
+    def test_extract_change_topdir(self):
+        src = os.path.join(self.tmp, "src")
+        os.mkdir(src)
+        a_long_dir = os.path.join(src, "a_long_dir")
+        os.mkdir(a_long_dir)
+        b = os.path.join(a_long_dir, "b")
+        with open(b, "w") as fp:
+            fp.write("b\n")
+        dest = os.path.join(self.tmp, "dest")
+        os.mkdir(dest)
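+        # topdir is expected to rename the archive's top-level directory on extraction (behaviour exercised below)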
+        tar_gz = qibuild.archive.zip_unix(a_long_dir)
+        qibuild.archive.extract(tar_gz, dest, topdir="a")
+        a = os.path.join(dest, "a")
+        ls_r = qibuild.sh.ls_r(a)
+        self.assertEquals(ls_r, ["b"])
+        a_zip = qibuild.archive.zip_win(a_long_dir)
+        qibuild.archive.extract(a_zip, dest, topdir="aa")
+        aa = os.path.join(dest, "aa")
+        ls_r = qibuild.sh.ls_r(aa)
+        self.assertEquals(ls_r, ["b"])
+
+    def test_extract_change_topdir_already_correct(self):
+        src = os.path.join(self.tmp, "src")
+        os.mkdir(src)
+        a_dir = os.path.join(src, "a")
+        os.mkdir(a_dir)
+        tar_gz = qibuild.archive.zip_unix(a_dir)
+        dest = os.path.join(self.tmp, "dest")
+        qibuild.archive.extract(tar_gz, dest, topdir="a")
+        ls_r = qibuild.sh.ls_r(dest)
+        self.assertEquals(ls_r, ["a/"])
+
+    def test_extract_with_symlink(self):
+        if sys.platform.startswith("win"):
+            return
+        src = os.path.join(self.tmp, "src")
+        os.mkdir(src)
+        a_dir = os.path.join(src, "a_dir")
+        os.mkdir(a_dir)
+        a_file = os.path.join(a_dir, "a_file")
+        with open(a_file, "w") as fp:
+            fp.write("a_file\n")
+        a_link = os.path.join(a_dir, "a_link")
+        os.symlink("a_file", a_link)
+        tar_gz = qibuild.archive.zip_unix(a_dir)
+        dest = os.path.join(self.tmp, "dest")
+        os.mkdir(dest)
+        qibuild.archive.extract(tar_gz, dest)
+        ls_r = qibuild.sh.ls_r(dest)
+        self.assertEquals(ls_r,
+            ['a_dir/a_file', 'a_dir/a_link'])
+        dest_link = os.path.join(dest, "a_dir", "a_link")
+        self.assertTrue(os.path.islink(dest_link))
+        dest_target = os.readlink(dest_link)
+        self.assertEquals(dest_target, "a_file")
+
+    def test_extract_with_symlink_and_change_topdir(self):
+        if sys.platform.startswith("win"):
+            return
+        src = os.path.join(self.tmp, "src")
+        os.mkdir(src)
+        a_long_dir = os.path.join(src, "a_long_dir")
+        os.mkdir(a_long_dir)
+        a_file = os.path.join(a_long_dir, "a_file")
+        with open(a_file, "w") as fp:
+            fp.write("a_file\n")
+        a_link = os.path.join(a_long_dir, "a_link")
+        os.symlink("a_file", a_link)
+        tar_gz = qibuild.archive.zip_unix(a_long_dir)
+        dest = os.path.join(self.tmp, "dest")
+        os.mkdir(dest)
+        qibuild.archive.extract(tar_gz, dest, topdir="a_dir")
+        ls_r = qibuild.sh.ls_r(dest)
+        self.assertEquals(ls_r,
+            ['a_dir/a_file', 'a_dir/a_link'])
+        dest_link = os.path.join(dest, "a_dir", "a_link")
+        self.assertTrue(os.path.islink(dest_link))
+        dest_target = os.readlink(dest_link)
+        self.assertEquals(dest_target, "a_file")
+
+
+if __name__ == "__main__":
+    unittest.main() 

+ 306 - 0
Directory/test_tool.py

@@ -0,0 +1,306 @@
+import unittest
+from unittest.mock import patch
+import os
+import shutil
+from programy.admin.tool import AdminTool
+
+
+class MockAdminTool(AdminTool):
+
+    def __init__(self):
+        AdminTool.__init__(self)
+        self.text = ""
+
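+    # Capture everything the tool would display so tests can assert on the output text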
+    def display(self, text):
+        self.text += text
+
+
+class AdminToolTests(unittest.TestCase):
+
+    def get_temp_dir(self):
+        if os.name == 'posix':
+            return '/tmp'
+        elif os.name == 'nt':
+            import tempfile
+            return tempfile.gettempdir()
+        else:
+            raise Exception("Unknown operating system [%s]" % os.name)
+
+    def create_file(self, filename):
+        with open(filename, "w+") as file:
+            file.writelines(["line1", "line2", "line3"])
+            file.flush()
+            file.close()
+
+    def test_recursive_copy(self):
+        tmp_dir = self.get_temp_dir() + os.sep + "programy"
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+
+        os.mkdir(tmp_dir)
+
+        src_dir =  tmp_dir + os.sep + "src"
+        os.mkdir(src_dir)
+        src_sub_dir = tmp_dir + os.sep + "src" + os.sep + "sub"
+        os.mkdir(src_sub_dir)
+        src_sub_dir2 = tmp_dir + os.sep + "src" + os.sep + "sub2"
+        os.mkdir(src_sub_dir2)
+        dest_dir = tmp_dir + os.sep + "dest"
+        os.mkdir(dest_dir)
+
+        self.create_file(src_dir + os.sep + "file1.txt")
+        self.create_file(src_dir + os.sep + "file2.txt")
+        self.create_file(src_dir + os.sep + "file3.txt")
+        self.create_file(src_dir + os.sep + "sub" + os.sep + "file4.txt")
+
+        AdminTool.recursive_copy(src_dir, dest_dir)
+
+        self.assertTrue(os.path.exists(src_dir + os.sep + "file1.txt"))
+        self.assertTrue(os.path.exists(src_dir + os.sep + "file2.txt"))
+        self.assertTrue(os.path.exists(src_dir + os.sep + "file3.txt"))
+        self.assertTrue(os.path.exists(src_dir + os.sep + "sub" + os.sep + "file4.txt"))
+        self.assertTrue(os.path.exists(src_dir + os.sep + "sub2"))
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+
+    def test_delete_folder_contents(self):
+        tmp_dir = self.get_temp_dir() + os.sep + "programy"
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+
+        os.mkdir(tmp_dir)
+
+        src_dir =  tmp_dir + os.sep + "src"
+        os.mkdir(src_dir)
+        src_sub_dir = tmp_dir + os.sep + "src" + os.sep + "sub"
+        os.mkdir(src_sub_dir)
+        dest_dir = tmp_dir + os.sep + "dest"
+        os.mkdir(dest_dir)
+
+        self.create_file(src_dir + os.sep + "file1.txt")
+        self.create_file(src_dir + os.sep + "file2.txt")
+        self.create_file(src_dir + os.sep + "file3.txt")
+        self.create_file(src_dir + os.sep + "sub" + os.sep + "file4.txt")
+
+        self.assertTrue(os.path.exists(src_dir + os.sep + "file1.txt"))
+        self.assertTrue(os.path.exists(src_dir + os.sep + "file2.txt"))
+        self.assertTrue(os.path.exists(src_dir + os.sep + "file3.txt"))
+        self.assertTrue(os.path.exists(src_dir + os.sep + "sub" + os.sep + "file4.txt"))
+
+        AdminTool.delete_folder_contents(tmp_dir)
+
+        self.assertFalse(os.path.exists(src_dir + os.sep + "file1.txt"))
+        self.assertFalse(os.path.exists(src_dir + os.sep + "file2.txt"))
+        self.assertFalse(os.path.exists(src_dir + os.sep + "file3.txt"))
+        self.assertFalse(os.path.exists(src_dir + os.sep + "sub" + os.sep + "file4.txt"))
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+
+    def test_make_executable(self):
+        tmp_dir = self.get_temp_dir() + os.sep + "programy"
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+
+        os.mkdir(tmp_dir)
+
+        filepath = tmp_dir + os.sep + "file1.txt"
+        self.create_file(filepath)
+
+        self.assertTrue(os.path.exists(filepath))
+
+        AdminTool.make_executable(filepath)
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+
+    def test_make_all_executable(self):
+        tmp_dir = self.get_temp_dir() + os.sep + "programy"
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+
+        os.mkdir(tmp_dir)
+
+        src_dir =  tmp_dir + os.sep + "src"
+        os.mkdir(src_dir)
+        src_sub_dir = tmp_dir + os.sep + "src" + os.sep + "sub"
+        os.mkdir(src_sub_dir)
+        dest_dir = tmp_dir + os.sep + "dest"
+        os.mkdir(dest_dir)
+
+        self.create_file(src_dir + os.sep + "file1.txt")
+        self.create_file(src_dir + os.sep + "file2.txt")
+        self.create_file(src_dir + os.sep + "file3.txt")
+        self.create_file(src_dir + os.sep + "sub" + os.sep + "file4.txt")
+
+        AdminTool.make_all_executable(tmp_dir)
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+
+    def test_list_bots(self):
+        tool = MockAdminTool()
+        self.assertEqual("", tool.text)
+
+        tool.list_bots()
+
+        self.assertEquals("""Available bots are:
+	alice2-y	professor-y	rosie-y	talk-y	y-bot	servusai-y	template-y	traintimes-y
+	To download use 'python3 -m programy.admin.tool download <bot-name>'
+Additional components are:
+	textblob
+	To install use 'python3 -m programy.admin.tool install <component>'""", tool.text)
+
+    def patch_wget_download(self, url):
+        return "mock.bot"
+
+    @patch("programy.admin.tool.AdminTool.wget_download", patch_wget_download)
+    def test_download_bot(self):
+        tool = MockAdminTool()
+        self.assertEqual("", tool.text)
+
+        filename = tool.download_bot("y-bot")
+        self.assertEqual("mock.bot", filename)
+
+        self.assertEqual("""Downloading [y-bot] from [https://github.com/keiffster/y-bot/archive/master.zip]
+Download complete""", tool.text)
+
+    def test_zip_dir_name_from_filename(self):
+        self.assertEqual("filename", AdminTool.zip_dir_name_from_filename('filename.zip'))
+        self.assertEqual("filename", AdminTool.zip_dir_name_from_filename('filename'))
+
+    def test_extract_bot_no_remove(self):
+        tool = AdminTool()
+
+        tmp_dir = self.get_temp_dir() + os.sep + "programy"
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+        os.mkdir(tmp_dir)
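+        # The bot.zip fixture is expected to sit next to this test file (os.path.dirname(__file__))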
+        shutil.copyfile(os.path.dirname(__file__) + os.sep + "bot.zip", tmp_dir + os.sep + "bot.zip")
+
+        tool.extract_bot(tmp_dir + os.sep + "bot.zip", path=tmp_dir, remove_after=False)
+
+        self.assertTrue(os.path.exists(tmp_dir + os.sep + "bot.zip"))
+        self.assertTrue(os.path.exists(tmp_dir + os.sep + "test1.txt"))
+        self.assertTrue(os.path.exists(tmp_dir + os.sep + "test2.txt"))
+
+        shutil.rmtree(tmp_dir)
+
+    def test_extract_bot_with_remove(self):
+        tool = AdminTool()
+
+        tmp_dir = self.get_temp_dir() + os.sep + "programy"
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+        os.mkdir(tmp_dir)
+        shutil.copyfile(os.path.dirname(__file__) + os.sep + "bot.zip", tmp_dir + os.sep + "bot.zip")
+
+        tool.extract_bot(tmp_dir + os.sep + "bot.zip", path=tmp_dir, remove_after=True)
+
+        self.assertFalse(os.path.exists(tmp_dir + os.sep + "bot.zip"))
+        self.assertTrue(os.path.exists(tmp_dir + os.sep + "test1.txt"))
+        self.assertTrue(os.path.exists(tmp_dir + os.sep + "test2.txt"))
+
+        shutil.rmtree(tmp_dir)
+
+    def patch_download_and_make_active(self, bot_name):
+        pass # Do nothing
+
+    @patch("programy.admin.tool.AdminTool.download_and_make_active", patch_download_and_make_active)
+    def test_install_bot(self):
+        tool = MockAdminTool()
+        self.assertEquals("", tool.text)
+
+        tool.install_bot(["test", "y-bot"])
+        self.assertEqual("""
+To run y-bot bot in console mode, use the following commands
+\tcd scripts/xnix\t./y-bot.sh""", tool.text)
+
+    def test_install_bot_unknown(self):
+        tool = MockAdminTool()
+        self.assertEquals("", tool.text)
+
+        with self.assertRaises(Exception):
+            tool.install_bot(["test", "unknown"])
+
+    def patch_install_textblob(self):
+        pass # Do nothing
+
+    @patch("programy.admin.tool.AdminTool.install_textblob", patch_install_textblob)
+    def test_install_additional(self):
+        tool = MockAdminTool()
+        self.assertEquals("", tool.text)
+
+        tool.install_additional(["test", "textblob"])
+        self.assertEqual("Installing additional components for textblob", tool.text)
+
+    def test_install_additional_invalid(self):
+        tool = MockAdminTool()
+        self.assertEquals("", tool.text)
+
+        with self.assertRaises(Exception):
+            tool.install_additional(["test", "xxxxxxx"])
+
+    def test_show_execute_help(self):
+        tool = MockAdminTool()
+        self.assertEqual("", tool.text)
+
+        tool.show_execute_help("y-bot")
+
+        self.assertEqual("""
+To run y-bot bot in console mode, use the following commands
+\tcd scripts/xnix\t./y-bot.sh""", tool.text)
+
+    def test_show_help(self):
+        tool = MockAdminTool()
+        self.assertEqual("", tool.text)
+
+        tool.show_help()
+
+        self.assertEqual("""Available commands are:
+\thelp	list	download <bot-name>	install <component>""", tool.text)
+
+    def test_run_no_words(self):
+        tool = MockAdminTool()
+        tool.run([])
+        self.assertIsNotNone(tool.text)
+        self.assertTrue(tool.text.startswith("Available commands are:"))
+
+    def test_run_unknown_primary_command(self):
+        tool = MockAdminTool()
+        tool.run(['unknown'])
+        self.assertIsNotNone(tool.text)
+        self.assertTrue(tool.text.startswith("Unknown primary command [unknown]"))
+
+    def test_run_missing_bot_name(self):
+        tool = MockAdminTool()
+        tool.run(['download'])
+        self.assertIsNotNone(tool.text)
+        self.assertTrue(tool.text.startswith("Missing bot name from download command"))
+
+    def test_run_list(self):
+        tool = MockAdminTool()
+        tool.run(['list'])
+        self.assertIsNotNone(tool.text)
+
+    def test_run_download(self):
+        tool = MockAdminTool()
+        tool.run(['download'])
+        self.assertIsNotNone(tool.text)
+
+    def test_run_install(self):
+        tool = MockAdminTool()
+        tool.run(['install'])
+        self.assertIsNotNone(tool.text)
+
+    def test_run_help(self):
+        tool = MockAdminTool()
+        tool.run(['help'])
+        self.assertIsNotNone(tool.text)

+ 272 - 0
Directory/tutorial.py

@@ -0,0 +1,272 @@
+import csv
+import os
+import re
+import shutil
+
+def del_create_analytics_folder():
+    # del the analytics folder including subfolder
+    # mkdir the analytics folder (only mkdir)
+    if os.path.exists('analytics'):
+        shutil.rmtree('analytics')
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')
+
+def course():
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')
+    if os.path.exists('analytics/course'):
+        shutil.rmtree('analytics/course')
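+    # Characters 3-4 of the student id encode the degree programme (e.g. '01' -> btech, '11' -> mtech)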
+    d = {'01':'btech',
+    '11':'mtech',
+    '21':'phd',
+    '12':'msc'}
+    with open('studentinfo_cs384.csv', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        if not os.path.exists('analytics/course'):
+            os.mkdir('analytics/course')
+        for row in reader:
+            if len(row)==0:
+                continue
+            l = list(row.values())
+            head = list(row.keys())
+            stream = str(row['id'][-4:-2]).lower()
+            yr = str(row['id'][:2])
+            if str(row['id'][2:4]) in list(d.keys()):
+                degree = d[str(row['id'][2:4])]
+            else:
+                with open('analytics/course/' + 'misc.csv' , mode = 'a') as f:
+                    f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                    if os.path.getsize('analytics/course/' + 'misc.csv')==0:
+                        f_write.writerow(head)
+                    f_write.writerow(l)
+                f.close()
+                continue
+            csv_name = f'{yr}_{stream}_{degree}.csv'
+            p = re.compile(r'\d\d\d\d\D\D\d\d')
+            k = re.fullmatch(p,row['id'])
+            if k:
+                if not os.path.exists('analytics/course/'+ stream):
+                    os.mkdir('analytics/course/'+ stream) 
+                if not os.path.exists('analytics/course/'+ stream + '/' + degree):
+                    os.mkdir('analytics/course/'+ stream + '/' + degree ) 
+                with open('analytics/course/'+ stream + '/' + degree + '/' + csv_name , mode = 'a') as f:
+                    f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                    if os.path.getsize('analytics/course/'+ stream + '/' + degree + '/' + csv_name)==0:
+                        f_write.writerow(head)
+                    f_write.writerow(l)
+                f.close()
+            else:
+                with open('analytics/course/' + 'misc.csv' , mode = 'a') as f:
+                    f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                    if os.path.getsize('analytics/course/' + 'misc.csv')==0:
+                        f_write.writerow(head)
+                    f_write.writerow(l)
+                f.close()
+    csvfile.close()
+
+
+def country():
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')    
+    if os.path.exists('analytics/country'):
+        shutil.rmtree('analytics/country')    
+    with open('studentinfo_cs384.csv', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        if not os.path.exists('analytics/country'):
+            os.mkdir('analytics/country')
+        for row in reader:
+            l = list(row.values())
+            head = list(row.keys())
+            with open('analytics/country/'+row['country'].lower()+ '.csv', mode = 'a') as f:
+                f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                if os.path.getsize('analytics/country/'+row['country'].lower() + '.csv')==0:
+                  f_write.writerow(head) 
+                f_write.writerow(l)
+            f.close()
+    csvfile.close()
+
+
+
+
+def email_domain_extract():
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')
+    if os.path.exists('analytics/email'):
+        shutil.rmtree('analytics/email')
+    with open('studentinfo_cs384.csv', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        if not os.path.exists('analytics/email'):
+            os.mkdir('analytics/email')
+        for row in reader:
+            l = list(row.values())
+            head = list(row.keys())
+            if '@' in row['email'] and '.' in row['email']:
+                domain = row['email'].split('@')[1].split('.')[0]
+                with open('analytics/email/'+domain+ '.csv', mode = 'a') as f:
+                    f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                    if os.path.getsize('analytics/email/'+ domain + '.csv')==0:
+                        f_write.writerow(head) 
+                    f_write.writerow(l)
+                f.close()
+
+            else:
+                with open('analytics/email/'+'misc'+ '.csv', mode = 'a') as f:
+                    f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                    if os.path.getsize('analytics/email/misc.csv')==0:
+                        f_write.writerow(head) 
+                    f_write.writerow(l)
+                f.close()
+    csvfile.close()
+
+
+
+
+
+def gender():
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')
+    if os.path.exists('analytics/gender'):
+        shutil.rmtree('analytics/gender')
+    with open('studentinfo_cs384.csv', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        if not os.path.exists('analytics/gender'):
+            os.mkdir('analytics/gender')
+        for row in reader:
+            l = list(row.values())
+            head = list(row.keys())
+            gender = row['gender'].lower()
+            with open('analytics/gender/'+gender+ '.csv', mode = 'a') as f:
+                    f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                    if os.path.getsize('analytics/gender/'+ gender + '.csv')==0:
+                        f_write.writerow(head) 
+                    f_write.writerow(l)
+            f.close()
+    csvfile.close()
+
+
+def dob():
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')
+    if os.path.exists('analytics/dob'):
+        shutil.rmtree('analytics/dob')
+    with open('studentinfo_cs384.csv', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        if not os.path.exists('analytics/dob'):
+            os.mkdir('analytics/dob')
+        for row in reader:
+            l = list(row.values())
+            head = list(row.keys())
+            x = str(re.sub(r"\D","-",row['dob']))
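+            # Bucket birth years into 5-year ranges (e.g. 1995_1999); years after 2014 all go into 2015_2020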
+            yr = int(x.split('-')[-1])
+            k = int(yr)%10
+            if k>4:
+                name = 'bday_' + str(yr - k + 5) + '_' + str(yr - k + 9)
+            else:
+                name = 'bday_' + str(yr - k ) + '_' + str(yr - k + 4)
+            if yr > 2014:
+                name = 'bday_2015_2020'
+            with open('analytics/dob/'+name+ '.csv', mode = 'a') as f:
+                    f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                    if os.path.getsize('analytics/dob/'+name+ '.csv')==0:
+                        f_write.writerow(head) 
+                    f_write.writerow(l)
+            f.close()
+        
+
+
+
+def state():
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')
+    if os.path.exists('analytics/state'):
+        shutil.rmtree('analytics/state')
+    with open('studentinfo_cs384.csv', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        if not os.path.exists('analytics/state'):
+            os.mkdir('analytics/state')
+        for row in reader:
+            l = list(row.values())
+            head = list(row.keys())
+            with open('analytics/state/'+row['state'].lower()+ '.csv', mode = 'a') as f:
+                f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                if os.path.getsize('analytics/state/'+row['state'].lower() + '.csv')==0:
+                  f_write.writerow(head) 
+                f_write.writerow(l)
+            f.close()
+    csvfile.close()
+
+
+def blood_group():
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')
+    if os.path.exists('analytics/blood_group'):
+        shutil.rmtree('analytics/blood_group')
+    with open('studentinfo_cs384.csv', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        if not os.path.exists('analytics/blood_group'):
+            os.mkdir('analytics/blood_group')
+        for row in reader:
+            l = list(row.values())
+            head = list(row.keys())
+            with open('analytics/blood_group/'+row['blood_group']+ '.csv', mode = 'a') as f:
+                f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                if os.path.getsize('analytics/blood_group/'+row['blood_group'] + '.csv')==0:
+                    f_write.writerow(head) 
+                f_write.writerow(l)
+            f.close()
+    csvfile.close()
+
+
+# Create the new file here and also sort it in this function only.
+def new_file_sort():
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')
+    new = []
+    head = []
+    with open('studentinfo_cs384.csv', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)       
+        for row in reader:
+            head = list(row.keys())
+            del head[1]
+            head.insert(1,'first_name')
+            head.insert(2,'last_name')
+            k = list(row.values())
+            del k[1]
+            k.insert(1,row['full_name'].split()[0])
+            k.insert(2,' '.join(row['full_name'].split()[1:]))
+            new.append(k)
+    csvfile.close()
+    with open('analytics/studentinfo_cs384_names_split.csv', newline='',mode='w') as f:
+        f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+        f_write.writerow(head)
+        for i in new:
+            f_write.writerow(i)
+    f.close()
+    #sorting
+    dic = {}
+    for i in new:
+        dic[i[1]]='#$%^&*'.join(i)
+    new = []
+    with open('analytics/studentinfo_cs384_names_split_sorted_first_name.csv', mode = 'w') as f:
+        pass  # just create/truncate the file before appending the sorted rows
+    f.close()
+    for i in sorted(dic.items()):
+        new.append(i[1].split('#$%^&*'))
+    with open('analytics/studentinfo_cs384_names_split_sorted_first_name.csv', mode = 'a') as f:
+        f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+        f_write.writerow(head)
+        for i in new:
+            f_write.writerow(i)
+    f.close()
+
+#if __name__ == "__main__":
+#     del_create_analytics_folder()
+#     course()
+#     blood_group()
+#     new_file_sort()
+#     state()
+#     email_domain_extract()
+#     state()
+#     gender()
+#     dob()

+ 52 - 0
Directory/utils.py

@@ -0,0 +1,52 @@
+import numpy as np
+import torch
+
+from model import DeepGL
+
+from logger import Logger
+
+import os
+
+
+def save_parameters(args, run_name):
+    with open(os.path.join(args.log_path, run_name)+'/parameters.txt', 'w') as f:
+        f.write('num_blocks {}, lr {}, beta1 {} beta2 {}, batch_size {} gamma  {} scheduler_step {}'.format(
+            args.num_blocks, args.lr, args.beta1, args.beta2, args.batch_size, 
+            args.gamma, args.scheduler_step
+        ))
+
+
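+# Lay out the per-run output folders (samples/, states/, tensorboard_logs/) unless resuming from a pretrained path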
+def prepare_directories(args, run_name):
+    if not os.path.isdir(args.data_path):
+        raise Exception("Invalid data path. No such directory")
+
+    if not os.path.isdir(args.log_path):
+        os.makedirs(args.log_path)
+
+    if args.pretrained_path:
+        if not os.path.isdir(args.pretrained_path) or \
+                not os.path.isdir(os.path.join(args.pretrained_path, 'states')):
+            raise Exception("Invalid path. No such directory with pretrained model")
+
+    else:
+        exp_path = os.path.join(args.log_path, run_name)
+        os.makedirs(exp_path)
+        os.makedirs(os.path.join(exp_path, 'samples'))
+        os.makedirs(os.path.join(exp_path, 'states'))
+        os.makedirs(os.path.join(exp_path, 'tensorboard_logs'))
+
+
+def build_model(args):
+    model = DeepGL(args.num_blocks)
+    if args.pretrained_path:
+        # checkpoints are written under 'states' (see prepare_directories), not 'samples'
+        model.load_state_dict(torch.load(
+            os.path.join(args.pretrained_path, 'states') + '/' + str(args.load_step) + '.pt'))
+
+    return model
+
+
+def prepare_logger(path):
+    if not os.path.isdir(path):
+        os.makedirs(path)
+    logger = Logger(path)
+    return logger

+ 632 - 0
Hash/EncrypC.py

@@ -0,0 +1,632 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+import hashlib
+import os
+import sys
+import threading
+import tkinter as tk
+from pathlib import Path
+from tkinter import filedialog, messagebox
+
+from Cryptodome.Cipher import AES
+
+
+class EncryptionTool:
+    def __init__(
+        self,
+        user_file,
+        user_key,
+        user_salt,
+    ):
+
+        # get the path to input file
+
+        self.user_file = user_file
+
+        self.input_file_size = os.path.getsize(self.user_file)
+        self.chunk_size = 1024
+        self.total_chunks = self.input_file_size // self.chunk_size + 1
+
+        # convert the key and salt to bytes
+
+        self.user_key = bytes(user_key, "utf-8")
+        self.user_salt = bytes(user_key[::-1], "utf-8")
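+        # note: the salt is derived from the reversed key; the user_salt argument is effectively unused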
+
+        # get the file extension
+
+        self.file_extension = self.user_file.split(".")[-1]
+
+        # hash type for hashing key and salt
+
+        self.hash_type = "SHA256"
+
+        # encrypted file name
+
+        self.encrypt_output_file = (
+            ".".join(self.user_file.split(".")[:-1])
+            + "."
+            + self.file_extension
+            + ".encr"
+        )
+
+        # decrypted file name
+
+        self.decrypt_output_file = self.user_file[:-5].split(".")
+        self.decrypt_output_file = (
+            ".".join(self.decrypt_output_file[:-1])
+            + "_decrypted."
+            + self.decrypt_output_file[-1]
+        )
+
+        # dictionary to store hashed key and salt
+
+        self.hashed_key_salt = dict()
+
+        # hash the key and salt down to a 32-byte key and a 16-byte IV
+
+        self.hash_key_salt()
+
+    def read_in_chunks(self, file_object, chunk_size=1024):
+        """Lazy function (generator) to read a file piece by piece.
+        Default chunk size: 1k.
+        """
+
+        while True:
+            data = file_object.read(chunk_size)
+            if not data:
+                break
+            yield data
+
+    def encrypt(self):
+
+        # create a cipher object
+
+        cipher_object = AES.new(
+            self.hashed_key_salt["key"], AES.MODE_CFB, self.hashed_key_salt["salt"]
+        )
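+        # AES in CFB mode; the hashed "salt" is passed as the IV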
+
+        self.abort()  # if the output file already exists, remove it first
+
+        input_file = open(self.user_file, "rb")
+        output_file = open(self.encrypt_output_file, "ab")
+        done_chunks = 0
+
+        for piece in self.read_in_chunks(input_file, self.chunk_size):
+            encrypted_content = cipher_object.encrypt(piece)
+            output_file.write(encrypted_content)
+            done_chunks += 1
+            yield done_chunks / self.total_chunks * 100
+
+        input_file.close()
+        output_file.close()
+
+        # clean up the cipher object
+
+        del cipher_object
+
+    def decrypt(self):
+
+        #  exact same as above function except in reverse
+
+        cipher_object = AES.new(
+            self.hashed_key_salt["key"], AES.MODE_CFB, self.hashed_key_salt["salt"]
+        )
+
+        self.abort()  # if the output file already exists, remove it first
+
+        input_file = open(self.user_file, "rb")
+        output_file = open(self.decrypt_output_file, "xb")
+        done_chunks = 0
+
+        for piece in self.read_in_chunks(input_file):
+            decrypted_content = cipher_object.decrypt(piece)
+            output_file.write(decrypted_content)
+            done_chunks += 1
+            yield done_chunks / self.total_chunks * 100
+
+        input_file.close()
+        output_file.close()
+
+        # clean up the cipher object
+
+        del cipher_object
+
+    def abort(self):
+        if os.path.isfile(self.encrypt_output_file):
+            os.remove(self.encrypt_output_file)
+        if os.path.isfile(self.decrypt_output_file):
+            os.remove(self.decrypt_output_file)
+
+    def hash_key_salt(self):
+
+        # --- convert key to hash
+        #  create a new hash object
+
+        hasher = hashlib.new(self.hash_type)
+        hasher.update(self.user_key)
+
+        # turn the output key hash into 32 bytes (256 bits)
+
+        self.hashed_key_salt["key"] = bytes(hasher.hexdigest()[:32], "utf-8")
+
+        # clean up hash object
+
+        del hasher
+
+        # --- convert salt to hash
+        #  create a new hash object
+
+        hasher = hashlib.new(self.hash_type)
+        hasher.update(self.user_salt)
+
+        # turn the output salt hash into 16 bytes (128 bits)
+
+        self.hashed_key_salt["salt"] = bytes(hasher.hexdigest()[:16], "utf-8")
+
+        # clean up hash object
+
+        del hasher
+
+
+class MainWindow:
+
+    """GUI Wrapper"""
+
+    # configure root directory path relative to this file
+
+    THIS_FOLDER_G = ""
+    if getattr(sys, "frozen", False):
+
+        # frozen
+
+        THIS_FOLDER_G = os.path.dirname(sys.executable)
+    else:
+
+        # unfrozen
+
+        THIS_FOLDER_G = os.path.dirname(os.path.realpath(__file__))
+
+    def __init__(self, root):
+        self.root = root
+        self._cipher = None
+        self._file_url = tk.StringVar()
+        self._secret_key = tk.StringVar()
+        self._secret_key_check = tk.StringVar()
+        self._salt = tk.StringVar()
+        self._status = tk.StringVar()
+        self._status.set("---")
+
+        self.should_cancel = False
+
+        root.title("EncrypC")
+        root.configure(bg="#eeeeee")
+
+        try:
+            icon_img = tk.Image(
+                "photo", file=self.THIS_FOLDER_G + "./files/encrypc.ico"
+            )
+            root.call("wm", "iconphoto", root._w, icon_img)
+        except Exception:
+            pass
+
+        self.menu_bar = tk.Menu(root, bg="#eeeeee", relief=tk.FLAT)
+        self.menu_bar.add_command(label="Help!", command=self.show_help_callback)
+        self.menu_bar.add_command(label="About", command=self.show_about)
+
+        root.configure(menu=self.menu_bar)
+
+        self.file_entry_label = tk.Label(
+            root,
+            text="Enter File Path Or Click SELECT FILE Button",
+            bg="#eeeeee",
+            anchor=tk.W,
+        )
+        self.file_entry_label.grid(
+            padx=12,
+            pady=(8, 0),
+            ipadx=0,
+            ipady=1,
+            row=0,
+            column=0,
+            columnspan=4,
+            sticky=tk.W + tk.E + tk.N + tk.S,
+        )
+
+        self.file_entry = tk.Entry(
+            root,
+            textvariable=self._file_url,
+            bg="#fff",
+            exportselection=0,
+            relief=tk.FLAT,
+        )
+        self.file_entry.grid(
+            padx=15,
+            pady=6,
+            ipadx=8,
+            ipady=8,
+            row=1,
+            column=0,
+            columnspan=4,
+            sticky=tk.W + tk.E + tk.N + tk.S,
+        )
+
+        self.select_btn = tk.Button(
+            root,
+            text="SELECT FILE",
+            command=self.selectfile_callback,
+            width=42,
+            bg="#3498db",
+            fg="#ffffff",
+            bd=2,
+            relief=tk.FLAT,
+        )
+        self.select_btn.grid(
+            padx=15,
+            pady=8,
+            ipadx=24,
+            ipady=6,
+            row=2,
+            column=0,
+            columnspan=4,
+            sticky=tk.W + tk.E + tk.N + tk.S,
+        )
+
+        self.key_entry_label1 = tk.Label(
+            root,
+            text="Enter Key (To be Remembered while Decryption)",
+            bg="#eeeeee",
+            anchor=tk.W,
+        )
+        self.key_entry_label1.grid(
+            padx=12,
+            pady=(8, 0),
+            ipadx=0,
+            ipady=1,
+            row=3,
+            column=0,
+            columnspan=4,
+            sticky=tk.W + tk.E + tk.N + tk.S,
+        )
+
+        self.key_entry1 = tk.Entry(
+            root,
+            textvariable=self._secret_key,
+            bg="#fff",
+            exportselection=0,
+            relief=tk.FLAT,
+        )
+        self.key_entry1.grid(
+            padx=15,
+            pady=6,
+            ipadx=8,
+            ipady=8,
+            row=4,
+            column=0,
+            columnspan=4,
+            sticky=tk.W + tk.E + tk.N + tk.S,
+        )
+
+        self.key_entry_label2 = tk.Label(
+            root, text="Re-enter Key (Validation)", bg="#eeeeee", anchor=tk.W
+        )
+        self.key_entry_label2.grid(
+            padx=12,
+            pady=(8, 0),
+            ipadx=0,
+            ipady=1,
+            row=5,
+            column=0,
+            columnspan=4,
+            sticky=tk.W + tk.E + tk.N + tk.S,
+        )
+
+        self.key_entry2 = tk.Entry(
+            root,
+            textvariable=self._secret_key_check,
+            bg="#fff",
+            exportselection=0,
+            relief=tk.FLAT,
+        )
+        self.key_entry2.grid(
+            padx=15,
+            pady=6,
+            ipadx=8,
+            ipady=8,
+            row=6,
+            column=0,
+            columnspan=4,
+            sticky=tk.W + tk.E + tk.N + tk.S,
+        )
+
+        self.encrypt_btn = tk.Button(
+            root,
+            text="ENCRYPT",
+            command=self.e_check_callback,
+            bg="#27ae60",
+            fg="#ffffff",
+            bd=2,
+            relief=tk.FLAT,
+        )
+        self.encrypt_btn.grid(
+            padx=15,
+            pady=8,
+            ipadx=24,
+            ipady=6,
+            row=7,
+            column=0,
+            columnspan=2,
+            sticky=tk.W + tk.E + tk.N + tk.S,
+        )
+
+        self.decrypt_btn = tk.Button(
+            root,
+            text="DECRYPT",
+            command=self.d_check_callback,
+            bg="#27ae60",
+            fg="#ffffff",
+            bd=2,
+            relief=tk.FLAT,
+        )
+        self.decrypt_btn.grid(
+            padx=15,
+            pady=8,
+            ipadx=24,
+            ipady=6,
+            row=7,
+            column=2,
+            columnspan=2,
+            sticky=tk.W + tk.E + tk.N + tk.S,
+        )
+
+        self.reset_btn = tk.Button(
+            root,
+            text="CLEAR",
+            command=self.reset_callback,
+            bg="#717d7e",
+            fg="#ffffff",
+            bd=2,
+            relief=tk.FLAT,
+        )
+        self.reset_btn.grid(
+            padx=15,
+            pady=8,
+            ipadx=24,
+            ipady=6,
+            row=8,
+            column=0,
+            columnspan=2,
+            sticky=tk.W + tk.E + tk.N + tk.S,
+        )
+
+        self.stop_btn = tk.Button(
+            root,
+            text="STOP",
+            command=self.cancel_callback,
+            bg="#aaaaaa",
+            fg="#ffffff",
+            bd=2,
+            state="disabled",
+            relief=tk.FLAT,
+        )
+        self.stop_btn.grid(
+            padx=15,
+            pady=8,
+            ipadx=24,
+            ipady=6,
+            row=8,
+            column=2,
+            columnspan=2,
+            sticky=tk.W + tk.E + tk.N + tk.S,
+        )
+
+        self.status_label = tk.Label(
+            root,
+            textvariable=self._status,
+            bg="#eeeeee",
+            anchor=tk.W,
+            justify=tk.LEFT,
+            relief=tk.FLAT,
+            wraplength=350,
+        )
+        self.status_label.grid(
+            padx=12,
+            pady=(0, 12),
+            ipadx=0,
+            ipady=1,
+            row=9,
+            column=0,
+            columnspan=4,
+            sticky=tk.W + tk.E + tk.N + tk.S,
+        )
+
+        tk.Grid.columnconfigure(root, 0, weight=1)
+        tk.Grid.columnconfigure(root, 1, weight=1)
+        tk.Grid.columnconfigure(root, 2, weight=1)
+        tk.Grid.columnconfigure(root, 3, weight=1)
+
+    def selectfile_callback(self):
+        try:
+            name = filedialog.askopenfile()
+            self._file_url.set(name.name)
+        except Exception as e:
+            self._status.set(e)
+            self.status_label.update()
+
+    def freeze_controls(self):
+        self.file_entry.configure(state="disabled")
+        self.key_entry1.configure(state="disabled")
+        self.key_entry2.configure(state="disabled")
+        self.select_btn.configure(state="disabled", bg="#aaaaaa")
+        self.encrypt_btn.configure(state="disabled", bg="#aaaaaa")
+        self.decrypt_btn.configure(state="disabled", bg="#aaaaaa")
+        self.reset_btn.configure(state="disabled", bg="#aaaaaa")
+        self.stop_btn.configure(state="normal", bg="#e74c3c")
+        self.status_label.update()
+
+    def unfreeze_controls(self):
+        self.file_entry.configure(state="normal")
+        self.key_entry1.configure(state="normal")
+        self.key_entry2.configure(state="normal")
+        self.select_btn.configure(state="normal", bg="#3498db")
+        self.encrypt_btn.configure(state="normal", bg="#27ae60")
+        self.decrypt_btn.configure(state="normal", bg="#27ae60")
+        self.reset_btn.configure(state="normal", bg="#717d7e")
+        self.stop_btn.configure(state="disabled", bg="#aaaaaa")
+        self.status_label.update()
+
+    def e_check_callback(self):
+
+        newPath = Path(self._file_url.get())
+        if newPath.is_file():
+            pass
+        else:
+            messagebox.showinfo("EncrypC", "Please Enter a valid File URL !!")
+            return
+
+        if len(self._secret_key.get()) == 0:
+            messagebox.showinfo("EncrypC", "Please Enter a valid Secret Key !!")
+            return
+        elif self._secret_key.get() != self._secret_key_check.get():
+            messagebox.showinfo("EncrypC", "Passwords do not match !!")
+            return
+
+        self.encrypt_callback()
+
+    def d_check_callback(self):
+
+        newPath = Path(self._file_url.get())
+        if newPath.is_file():
+            pass
+        else:
+            messagebox.showinfo("EncrypC", "Please Enter a valid File URL !!")
+            return
+
+        if self._file_url.get()[-4:] != "encr":
+            messagebox.showinfo(
+                "EncrypC",
+                """Provided File is not an Encrypted File !!
+Please Enter an Encrypted File to Decrypt.""",
+            )
+            return
+
+        if len(self._secret_key.get()) == 0:
+            messagebox.showinfo("EncrypC", "Please Enter a Secret Key !!")
+            return
+        elif self._secret_key.get() != self._secret_key_check.get():
+            messagebox.showinfo("EncrypC", "Passwords do not match !!")
+            return
+
+        self.decrypt_callback()
+
+    def encrypt_callback(self):
+        t1 = threading.Thread(target=self.encrypt_execute)
+        t1.start()
+
+    def encrypt_execute(self):
+        self.freeze_controls()
+
+        try:
+            self._cipher = EncryptionTool(
+                self._file_url.get(), self._secret_key.get(), self._salt.get()
+            )
+            for percentage in self._cipher.encrypt():
+                if self.should_cancel:
+                    break
+                percentage = "{0:.2f}%".format(percentage)
+                self._status.set(percentage)
+                self.status_label.update()
+
+            if self.should_cancel:
+                self._cipher.abort()
+                self._status.set("Cancellation Successful !!")
+                messagebox.showinfo("EncrypC", "Cancellation Successful !!")
+                self._cipher = None
+                self.should_cancel = False
+                self.unfreeze_controls()
+                return
+
+            self._cipher = None
+            self.should_cancel = False
+            self._status.set("File Encryption Successful !!")
+            messagebox.showinfo("EncrypC", "File Encryption Successful !!")
+        except Exception as e:
+
+            self._status.set(e)
+
+        self.unfreeze_controls()
+
+    def decrypt_callback(self):
+        t2 = threading.Thread(target=self.decrypt_execute)
+        t2.start()
+
+    def decrypt_execute(self):
+        self.freeze_controls()
+
+        try:
+            self._cipher = EncryptionTool(
+                self._file_url.get(), self._secret_key.get(), self._salt.get()
+            )
+            for percentage in self._cipher.decrypt():
+                if self.should_cancel:
+                    break
+                percentage = "{0:.2f}%".format(percentage)
+                self._status.set(percentage)
+                self.status_label.update()
+
+            if self.should_cancel:
+                self._cipher.abort()
+                self._status.set("Cancellation Successful !!")
+                messagebox.showinfo("EncrypC", "Cancellation Successful !!")
+                self._cipher = None
+                self.should_cancel = False
+                self.unfreeze_controls()
+                return
+
+            self._cipher = None
+            self.should_cancel = False
+            self._status.set("File Decryption Successful !!")
+            messagebox.showinfo("EncrypC", "File Decryption Successful !!")
+        except Exception as e:
+
+            self._status.set(e)
+
+        self.unfreeze_controls()
+
+    def reset_callback(self):
+        self._cipher = None
+        self._file_url.set("")
+        self._secret_key.set("")
+        self._salt.set("")
+        self._status.set("---")
+
+    def cancel_callback(self):
+        self.should_cancel = True
+
+    def show_help_callback(self):
+        messagebox.showinfo(
+            "Tutorial",
+            """1. Open the Application and Click SELECT FILE Button to select your file e.g. "mydoc.pdf" (OR You can add path manually).
+2. Enter your Key (This should be alphanumeric letters). Remember this so you can Decrypt the file later. (Else you'll lose your file permanently)
+3. Click ENCRYPT Button to encrypt the file. A new encrypted file with ".encr" extention e.g. "mydoc.pdf.encr" will be created in the same directory where the "mydoc.pdf" is.
+4. When you want to Decrypt a file you, will select the file with the ".encr" extention and Enter your Key which you chose at the time of Encryption. Click DECRYPT Button to decrypt. The decrypted file will be of the same name as before with the suffix "decrypted" for e.g. "mydoc_decrypted.pdf".
+5. Click CLEAR Button to reset the input fields and status bar.""",
+        )
+
+    def show_about(self):
+        messagebox.showinfo(
+            "EncrypC v1.2.0",
+            """EncrypC is a File Encryption Tool based on AES Algorithm. 
+Managed by Dhruv Panchal.
+https://github.com/dhhruv""",
+        )
+
+
+if __name__ == "__main__":
+    ROOT = tk.Tk()
+    MAIN_WINDOW = MainWindow(ROOT)
+    bundle_dir = getattr(sys, "_MEIPASS", os.path.abspath(os.path.dirname(__file__)))
+    path_to_ico = os.path.abspath(os.path.join(bundle_dir, "encrypc.ico"))
+    ROOT.iconbitmap(path_to_ico)
+    ROOT.resizable(height=False, width=False)
+    ROOT.mainloop()

+ 139 - 0
Hash/EncryptionDecryption.py

@@ -0,0 +1,139 @@
+#!/usr/bin/python3
+
+"""
+Created on Wed Aug 03 15:34:06 2016
+
+@author: RAVI TEJA AINAMPUDI
+"""
+from Crypto.Hash import MD5
+from Crypto.Cipher import AES
+from Crypto import Random
+import os
+import sys
+
+
+class DynamicEncryptionAndDecryption(object):
+    def __init__(self, filename=None):
+        self.filename = filename
+
+    def encrypt(self, key, filename):
+        chunksize = 128 * 1024
+        outFile = os.path.join(os.path.dirname(filename), "(Secured)" + os.path.basename(filename))
+        filesize = str(os.path.getsize(filename)).zfill(16)
+        IV = Random.new().read(AES.block_size)
+        print(IV, len(IV))
+        encryptor = AES.new(key, AES.MODE_CBC, IV)
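+        # Output layout: 16-byte zero-padded plaintext size, 16-byte IV, then CBC ciphertext (chunks space-padded to 16 bytes)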
+
+        with open(filename, "rb") as infile:
+            with open(outFile, "wb") as outfile:
+                outfile.write(filesize.encode('utf-8'))
+                outfile.write(IV)
+                while True:
+                    chunk = infile.read(chunksize)
+                    if len(chunk) == 0:
+                        break
+                    elif len(chunk) % 16 != 0:
+                        chunk += b' ' * (16 - (len(chunk) % 16))
+                    outfile.write(encryptor.encrypt(chunk))
+        return outFile
+
+    def decrypt(self, key, filename):
+        outFile = os.path.join(os.path.dirname(filename),
+                               os.path.basename(filename).replace("(Secured)", ""))
+        print(outFile)
+        chunksize = 128 * 1024
+        with open(filename, "rb") as infile:
+            filesize = infile.read(16)
+            IV = infile.read(16)
+            decryptor = AES.new(key, AES.MODE_CBC, IV)
+
+            with open(outFile, "wb") as outfile:
+                while True:
+                    chunk = infile.read(chunksize)
+                    if len(chunk) == 0:
+                        break
+                    outfile.write(decryptor.decrypt(chunk))
+                outfile.truncate(int(filesize))
+        return outFile
+
+    @staticmethod
+    def allfiles(path=os.getcwd()):
+        allFiles = []
+        for root, subfiles, files in os.walk(path):
+            for dir_name in subfiles:
+                allFiles.append(os.path.join(root, dir_name))
+            for file_name in files:
+                allFiles.append(os.path.join(root, file_name))
+        return allFiles
+
+
+def choices():
+    ed_object = DynamicEncryptionAndDecryption()
+    choice = input("Do you want to L - List the Files, E - Encrypt or D - Decrypt? ==")
+    print("\n")
+    perform_multiple_encryption = input(f"Do you want to perform multi-layered encryption? Y-Yes or N-No: ")
+    if perform_multiple_encryption.lower() not in ("yes", "y"):
+        perform_multiple_encryption = False
+        password = input("Please enter the `Password/Key` to be used: ")
+    else:
+        encFiles = ed_object.allfiles()
+
+    if choice == "E":
+        print("")
+        subchoice = input("Want to encrypt all the Files ? Y- Yes or N - No ? :")
+        if subchoice == "Y":
+            for Tfiles in encFiles:
+                if os.path.basename(Tfiles).startswith("(Secured)"):
+                    print(f"{Tfiles} is already encrypted")
+                    pass
+                elif Tfiles == os.path.join(os.getcwd(), sys.argv[0]):
+                    pass
+                else:
+                    ed_object.encrypt(MD5.new(password.encode()).digest(), str(Tfiles))
+                    print(f"Done Encryption for {Tfiles}")
+                    os.remove(Tfiles)
+        elif subchoice == "N":
+            print("")
+            filename = input("Enter the Filename to Encrypt: ")
+            if not os.path.exists(filename):
+                print(f"Given file {filename} does not exist")
+                sys.exit(0)
+            elif filename.startswith("(Secured)"):
+                print(f"File {filename} was already encrypted")
+                sys.exit()
+            else:
+                ed_object.encrypt(MD5.new(password.encode()).digest(), filename)
+                print(f"Done Encryption of {filename}")
+                os.remove(filename)
+        else:
+            print("\n Enter either Y or N")
+
+    elif choice == "D":
+        print("")
+        filename = input("Enter the filename to decrypt: ")
+        if not os.path.exists(filename):
+            print(f"Given file {filename} does not exist")
+            sys.exit(0)
+        elif not filename.startswith("(Secured)"):
+            print("{filename} file was never encrypted")
+            sys.exit()
+        else:
+            ed_object.decrypt(MD5.new(password.encode()).digest(), filename)
+            print(f"Done Decryption for {filename}")
+            os.remove(filename)
+
+    elif choice == "L":
+        print(" \n The files present in the current directory are: ")
+        file_list = []
+        for content in os.listdir(".."):
+            file_list.append(content)
+        print(file_list)
+
+    else:
+        print("\n Please choose a valid command. Either E or D")
+        sys.exit()
+
+
+if __name__ == "__main__":
+    # choices()
+    pass

+ 1 - 0
Hash/Encryption_And_Hashing

@@ -0,0 +1 @@
+Subproject commit ce441e388d9ffc902bece98cdeb33794f1716f34

+ 70 - 0
Hash/base64.py

@@ -0,0 +1,70 @@
+import base64
+import hashlib
+import time
+
+#Put the drive name (C:) followed by the directory and you
+#can encrypt any file on the PC.
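+# NOTE: base64 is an encoding, not encryption; anyone can reverse it without a key.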
+
+def menu():
+    time.sleep(0.5)
+    choice = input("\n1. encrypt File\n2. encrypt message\n3. decrypt message\n4. exit\nchoice: ")
+    if choice == "1":
+        encryptFile()
+        
+    elif choice == "2":
+        encrypt()
+    
+        
+    elif choice == "3":
+        decryptMessage()
+        
+    elif choice == "4":
+        exit()
+        
+    
+        
+    else:
+        print ("Not a valid choice.")
+        
+
+def encryptFile():
+    myFile = input("enter file to encrypt: ")
+    file = open(myFile,"r")
+    contents = file.read()
+    contents = contents.encode()
+    file = open(myFile, "w")
+    encoded = base64.b64encode(contents)
+    # the .decode() converts the bytes to str, taking off the b'...'
+    file.write(str(encoded))
+    print ("File is now encrypted... and the contents is unreadable")
+
+
+
+def decryptMessage():
+    pwd = "N3VIQUJmZ2pyNDVkZDRvMzNkZmd0NzBkZzlLOWRmcjJ0NWhCdmRm"
+    key = base64.b64decode(pwd) #the decoded version of this is the key.
+    value = input("Enter the decryption key: ").encode()
+    if value == key:
+        time.sleep(1)
+        message = input("Enter the message to decode: ")
+        decoded = base64.b64decode(message)
+        print (decoded)
+        menu()
+        
+    else:
+        print("Decryption key is wrong.")
+        menu()
+
+
+def encrypt():
+    password = input("Enter a message: ").encode()
+    encoded = base64.b64encode(password)
+    print (encoded.decode()) 
+    menu()
+
+
+def hashing(password):
+    hash1 = hashlib.md5(str.encode(password)).hexdigest()
+    print ("your hashed password is:", hash1,"\n")
+
+menu() 
+    

+ 485 - 0
Hash/biometry_hash.py

@@ -0,0 +1,485 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# docs: /usr/share/doc/python-fingerprint
+
+
+import hashlib
+import os
+import tempfile
+import time
+from pyfingerprint.pyfingerprint import PyFingerprint
+import random  # os is already imported above
+from Crypto.Cipher import AES
+from Crypto.Hash import SHA256
+
+def enroll():
+    
+    ## Enrolls new finger
+    ##
+
+    ## Tries to initialize the sensor
+    try:
+        f = PyFingerprint('/dev/ttyUSB0', 57600, 0xFFFFFFFF, 0x00000000)
+
+        if ( f.verifyPassword() == False ):
+            raise ValueError('The given fingerprint sensor password is wrong!')
+
+    except Exception as e:
+        print('The fingerprint sensor could not be initialized!')
+        print('Exception message: ' + str(e))
+        exit(1)
+
+
+    ## Gets some sensor information
+    print('Currently stored templates: ' + str(f.getTemplateCount()))
+
+    ## Tries to enroll new finger
+    try:
+        print('Waiting for finger...')
+
+        ## Wait that finger is read
+        while ( f.readImage() == False ):
+            pass
+
+        ## Converts read image to characteristics and stores it in charbuffer 1
+        f.convertImage(0x01)
+        
+        ## Checks if finger is already enrolled
+        result = f.searchTemplate()
+        positionNumber = result[0]
+        
+        ## Gets new position number (the counting starts at 0, so we do not need to increment)
+        #positionNumber = f.getTemplateCount()
+        
+        if ( positionNumber >= 0 ):
+            f.loadTemplate(positionNumber, 0x01)
+            characterics = str(f.downloadCharacteristics(0x01))
+            passhashes = hashlib.sha256(characterics).hexdigest()
+            passhash = passhashes[0:32]
+            print('Template already exists at position #' + str(positionNumber))
+            return passhash
+
+            
+        print('Remove finger...')
+        time.sleep(2)
+
+        print('Waiting for same finger again...')
+
+        ## Wait that finger is read again
+        while ( f.readImage() == False ):
+            pass
+
+        ## Converts read image to characteristics and stores it in charbuffer 2
+        f.convertImage(0x02)
+
+        ## Compares the charbuffers and creates a template
+        f.createTemplate()
+
+        ## Gets new position number (the counting starts at 0, so we do not need to increment)
+        positionNumber = f.getTemplateCount()
+
+        ## Saves template at new position number
+        if ( f.storeTemplate(positionNumber) == True ):
+            print('Finger enrolled successfully!')
+            print('New template position #' + str(positionNumber))
+            ## Hashes characteristics of the stored template
+            characterics = str(f.downloadCharacteristics(0x01))
+            passhashes = hashlib.sha256(characterics).hexdigest()
+            passhash = passhashes[0:32]
+            print('SHA-2 hash of template: ' + passhash)
+            return passhash
+            
+    except Exception as e:
+        print('Operation failed!')
+        print('Exception message: ' + str(e))
+        exit(1)
+        #print('SHA-2 hash of template: ' + hashlib.sha256(characterics).hexdigest())
+            
+    
+def index():
+    
+
+    ## Shows the template index table
+    ##
+
+    ## Tries to initialize the sensor
+    try:
+        f = PyFingerprint('/dev/ttyUSB0', 57600, 0xFFFFFFFF, 0x00000000)
+
+        if ( f.verifyPassword() == False ):
+            raise ValueError('The given fingerprint sensor password is wrong!')
+
+    except Exception as e:
+        print('The fingerprint sensor could not be initialized!')
+        print('Exception message: ' + str(e))
+        exit(1)
+
+    ## Gets some sensor information
+    print('Currently stored templates: ' + str(f.getTemplateCount()))
+
+    ## Tries to show a template index table page
+    try:
+        page = raw_input('Please enter the index page (0, 1, 2, 3) you want to see: ')
+        page = int(page)
+
+        tableIndex = f.getTemplateIndex(page)
+
+        for i in range(0, len(tableIndex)):
+            print('Template at position #' + str(i) + ' is used: ' + str(tableIndex[i]))
+
+    except Exception as e:
+        print('Operation failed!')
+        print('Exception message: ' + str(e))
+        exit(1)
+
+
+def fp_download():
+
+    ## Reads image and download it
+
+    ## Tries to initialize the sensor
+    try:
+        f = PyFingerprint('/dev/ttyUSB0', 57600, 0xFFFFFFFF, 0x00000000)
+
+        if ( f.verifyPassword() == False ):
+            raise ValueError('The given fingerprint sensor password is wrong!')
+
+    except Exception as e:
+        print('The fingerprint sensor could not be initialized!')
+        print('Exception message: ' + str(e))
+        exit(1)
+
+    ## Gets some sensor information
+    print('Currently stored templates: ' + str(f.getTemplateCount()))
+
+    ## Tries to read image and download it
+    try:
+        print('Waiting for finger...')
+
+        ## Wait that finger is read
+        while ( f.readImage() == False ):
+            pass
+
+        print('Downloading image (this takes a while)...')
+
+        imageDestination =  tempfile.gettempdir() + '/fingerprint.bmp'
+        f.downloadImage(imageDestination)
+
+        print('The image was saved to "' + imageDestination + '".')
+
+    except Exception as e:
+        print('Operation failed!')
+        print('Exception message: ' + str(e))
+        exit(1)
+
+def delete():
+
+    ## Deletes a finger from the sensor
+    ##
+
+
+    ## Tries to initialize the sensor
+    try:
+        f = PyFingerprint('/dev/ttyUSB0', 57600, 0xFFFFFFFF, 0x00000000)
+
+        if ( f.verifyPassword() == False ):
+            raise ValueError('The given fingerprint sensor password is wrong!')
+
+    except Exception as e:
+        print('The fingerprint sensor could not be initialized!')
+        print('Exception message: ' + str(e))
+        exit(1)
+
+    ## Gets some sensor information
+    print('Currently stored templates: ' + str(f.getTemplateCount()))
+
+    ## Tries to delete the template of the finger
+    try:
+        positionNumber = raw_input('Please enter the template position you want to delete: ')
+        positionNumber = int(positionNumber)
+
+        if ( f.deleteTemplate(positionNumber) == True ):
+            print('Template deleted!')
+
+    except Exception as e:
+        print('Operation failed!')
+        print('Exception message: ' + str(e))
+        exit(1)
+
+def fp_search():
+        
+    """
+    PyFingerprint
+    Copyright (C) 2015 Bastian Raschke <bastian.raschke@posteo.de>
+    All rights reserved.
+
+    @author: Bastian Raschke <bastian.raschke@posteo.de>
+    """
+
+
+    ## Search for a finger
+    ##
+
+    ## Tries to initialize the sensor
+    try:
+        f = PyFingerprint('/dev/ttyUSB0', 57600, 0xFFFFFFFF, 0x00000000)
+
+        if ( f.verifyPassword() == False ):
+            raise ValueError('The given fingerprint sensor password is wrong!')
+
+    except Exception as e:
+        print('The fingerprint sensor could not be initialized!')
+        print('Exception message: ' + str(e))
+        exit(1)
+
+    ## Gets some sensor information
+    print('Currently stored templates: ' + str(f.getTemplateCount()))
+
+    ## Tries to search the finger and calculate hash
+    try:
+        print('Waiting for finger...')
+
+        ## Wait that finger is read
+        while ( f.readImage() == False ):
+            pass
+
+        ## Converts read image to characteristics and stores it in charbuffer 1
+        f.convertImage(0x01)
+
+        ## Searches template
+        result = f.searchTemplate()
+
+        positionNumber = result[0]
+        accuracyScore = result[1]
+
+        if ( positionNumber == -1 ):
+            print('No match found!')
+            exit(0)
+        else:
+            print('Found template at position #' + str(positionNumber))
+            print('The accuracy score is: ' + str(accuracyScore))
+
+        ## OPTIONAL stuff
+        ##
+
+        ## Loads the found template to charbuffer 1
+        f.loadTemplate(positionNumber, 0x01)
+
+        ## Downloads the characteristics of template loaded in charbuffer 1
+        characterics = str(f.downloadCharacteristics(0x01))
+
+        ## Hashes characteristics of template
+        print('SHA-2 hash of template: ' + hashlib.sha256(characterics).hexdigest())
+
+    except Exception as e:
+        print('Operation failed!')
+        print('Exception message: ' + str(e))
+        exit(1)
+
+
+def get_decision():
+    # initial prompt at the start of the program
+    decision = raw_input()
+    return decision
+
+def AES_full(passhash):
+    EncDec = passhash
+    choice1 = ""
+    while choice1 != "1" and choice1 != "2":
+        choice1 = raw_input()
+
+    if choice1 == "1":
+        print ("\nEncryption/Decryption")
+        AESmenu(EncDec)
+
+    if choice1 == "2":
+        print ("\nMain Menu")
+        main()
+
+def encrypt(key, filename):
+    chunksize = 64 * 1024
+    outputFile = "(encrypted)" + filename
+    filesize = str(os.path.getsize(filename)).zfill(16)
+    IV = ''
+
+    for i in range(16):
+        IV += chr(random.randint(0, 0xFF))
+
+    encryptor = AES.new(key, AES.MODE_CBC, IV)
+
+    with open(filename, 'rb') as infile:
+        with open(outputFile, 'wb') as outfile:
+            # the first 16 bytes store the original file size, the next 16 the IV
+            outfile.write(filesize)
+            outfile.write(IV)
+
+            while True:
+                chunk = infile.read(chunksize)
+
+                if len(chunk) == 0:
+                    break
+                elif len(chunk) % 16 != 0:
+                    # pad the last chunk with spaces to a 16-byte boundary
+                    chunk += ' ' * (16 - (len(chunk) % 16))
+
+                outfile.write(encryptor.encrypt(chunk))
+
+
+def decrypt(key, filename):
+    chunksize = 64 * 1024
+    outputFile = filename[11:]  # strip the "(encrypted)" prefix
+
+    with open(filename, 'rb') as infile:
+        filesize = long(infile.read(16))
+        IV = infile.read(16)
+
+        decryptor = AES.new(key, AES.MODE_CBC, IV)
+
+        with open(outputFile, 'wb') as outfile:
+            while True:
+                chunk = infile.read(chunksize)
+
+                if len(chunk) == 0:
+                    break
+
+                outfile.write(decryptor.decrypt(chunk))
+            # drop the padding added during encryption
+            outfile.truncate(filesize)
+
+"""def getKey(password):
+	hasher = SHA256.new(password)
+	return hasher.digest()"""
+def file_check(filename):
+    try:
+        with open(filename, 'r'):
+            return 1
+    except IOError:
+        print ("This file doesn't exist")
+        return 0
+
+
+def AESmenu(EncDec):
+    choice = raw_input("Would you like to (E)ncrypt or (D)ecrypt?: ")
+    f_in = raw_input("Insert the filename with extension: ")
+    fileblob = file_check(f_in)
+    while fileblob == 0:
+        f_in = raw_input("Insert the filename with extension: ")
+        fileblob = file_check(f_in)
+    print(f_in)
+    if choice == 'E':
+        encrypt(EncDec, f_in)
+        print ("Done.")
+    elif choice == 'D':
+        decrypt(EncDec, f_in)
+        print ("Done.")
+    else:
+        print ("No option selected, closing...")
+
+def main():
+
+    decision = get_decision()
+    while decision != "quit" and decision != "2":
+        print("Please choose from the following.")
+        choice = ""
+        while choice not in ("1", "2", "3", "4", "5"):
+            choice = raw_input()
+
+        if choice == "1":
+            print ("\nBiometric Enrollment")
+            hashcode = enroll()
+            print (hashcode)
+            AES_full(hashcode)
+                
+        if choice == "2":
+            print( "\nShow Biometric Index")
+            index()
+            
+        if choice == "3":
+            print ("\nSearch Index Using Biometrics")
+            fp_search()
+            
+        if choice == "4":
+            print ("\nDownload Biometric Image")
+            fp_download()
+            
+        if choice == "5":
+            print ("\nDelete Biometric Image")
+            delete()
+
+            '''
+            print "The message inserted was: %s " % message
+            ciphertext = encrypt(message , m)
+            print "Lets apply RC4 with salt for security"
+            cipherplay = ''.join(ciphertext)
+            RC = encryptRC(cipherplay, matrix_key)
+            print "Message with RC4 with salt"
+            print RC
+            print "Do you want to save the message to a file? (Y/N)"
+            answer = raw_input()
+            while answer is not "Y" and answer is not "y" and answer is not "N" and answer is not "n":
+                print "Do you want to save the message to a file? (Y/N)"
+                answer = raw_input()
+            if answer == "Y" or answer == "y":
+                fout = open('Encrypted.txt', 'w')
+                fout.write(RC)
+                fout.close()
+                print "Message save to file Encrypted.txt on the folder of this program\n"
+                print "Press Enter to continue:"
+                raw_input()
+            else:
+                print "Press enter to continue"
+                raw_input()
+            decision = get_decision()
+        
+        elif choice == "2":
+            ch_enc = ""
+            print "\nMessage Decryption:"
+            print "1 - Open a custom file"
+            print "2 - Copy the encrypted message"
+            ch_enc = raw_input()
+            if ch_enc == "1":
+                print "Insert the filename with extension"
+                f_in = raw_input()
+                file = file_check(f_in)
+                
+                while file == 0:
+                    print "Insert the filename with the extension"
+                    f_in = raw_input()
+                    file = file_check(f_in)
+                    
+                fin = open(f_in, 'U')
+                RCmessage = fin.read()
+                fin.close()
+
+            elif ch_enc == "2":
+                print "Insert the encripted message:"
+                RCmessage = raw_input()
+            
+            print "You inserted ", RCmessage
+            ms = decryptRC(RCmessage, matrix_key)
+            message = get_message(ms)
+            plaintext = decrypt(message, m)
+            print "\nPress enter to continue"
+            raw_input()
+            decision = get_decision()
+        
+        else:
+            print "Wrong Choice"
+    print "Thanks for using the software \nGood Bye"    
+    raw_input()
+    '''
+
+        # Re-prompt so the outer loop can actually terminate on "quit" or "2".
+        decision = get_decision()
+if __name__ == "__main__":
+    main()
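
For reference, a minimal Python 3 sketch of the key derivation and AES-CBC round trip that biometry_hash.py relies on, assuming the pycryptodome package; the template string below is a hypothetical stand-in for the sensor characteristics:

import hashlib
from Crypto.Cipher import AES              # pycryptodome
from Crypto.Random import get_random_bytes

template = "example fingerprint characteristics"   # stand-in for downloadCharacteristics(0x01)
key = hashlib.sha256(template.encode()).hexdigest()[:32].encode()  # 32 hex chars -> 32-byte AES-256 key

iv = get_random_bytes(16)
cipher = AES.new(key, AES.MODE_CBC, iv)

data = b"sixteen byte msg"                 # CBC input must be a multiple of 16 bytes
ciphertext = cipher.encrypt(data)

restored = AES.new(key, AES.MODE_CBC, iv).decrypt(ciphertext)
assert restored == data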

Some files were not shown because too many files changed in this diff