刘凡 3 years ago
commit
ba16f626df
100 changed files with 19401 additions and 0 deletions
  1. BIN  .DS_Store
  2. +3 -0  .gitignore
  3. +3 -0  .idea/.gitignore
  4. +10 -0  .idea/GnnForPrivacyScan.iml
  5. +15 -0  .idea/inspectionProfiles/Project_Default.xml
  6. +6 -0  .idea/inspectionProfiles/profiles_settings.xml
  7. +7 -0  .idea/misc.xml
  8. +8 -0  .idea/modules.xml
  9. +30 -0  .idea/vcs.xml
  10. BIN  data/.DS_Store
  11. +65 -0  data/edge.txt
  12. +108 -0  data/node.txt
  13. BIN  data/purposeCombined/.DS_Store
  14. +3 -0  data/purposeCombined/Azure/.vscode/settings.json
  15. +82 -0  data/purposeCombined/Azure/AddUp/Azure-blob-storage.py
  16. +51 -0  data/purposeCombined/Azure/AddUp/blob-upload-1.py
  17. +221 -0  data/purposeCombined/Azure/AddUp/blob-upload-2.py
  18. +107 -0  data/purposeCombined/Azure/AddUp/blob-upload.py
  19. +231 -0  data/purposeCombined/Azure/AddUp/circuitbreaker.py
  20. +138 -0  data/purposeCombined/Azure/AddUp/datafactory.py
  21. +202 -0  data/purposeCombined/Azure/AddUp/file_advanced_samples.py
  22. +190 -0  data/purposeCombined/Azure/AddUp/file_basic_samples.py
  23. +415 -0  data/purposeCombined/Azure/AddUp/python-quick-start.py
  24. +218 -0  data/purposeCombined/Azure/AddUp/table_advanced_samples.py
  25. +96 -0  data/purposeCombined/Azure/AddUp/table_basic_samples.py
  26. +1 -0  data/purposeCombined/Azure/AzureStorage
  27. +125 -0  data/purposeCombined/Azure/DLfile.py
  28. +1 -0  data/purposeCombined/Azure/azure-multiapi-storage-python
  29. +64 -0  data/purposeCombined/Azure/blob-adapter.py
  30. +98 -0  data/purposeCombined/Azure/blob-permission.py
  31. +101 -0  data/purposeCombined/Azure/blob-upload-1.py
  32. +81 -0  data/purposeCombined/Azure/blob-upload-2.py
  33. +57 -0  data/purposeCombined/Azure/blob-upload-3.py
  34. +67 -0  data/purposeCombined/Azure/blob-upload-4.py
  35. +107 -0  data/purposeCombined/Azure/blob-upload.py
  36. +221 -0  data/purposeCombined/Azure/django-blob.py
  37. +1 -0  data/purposeCombined/Azure/python-text-classification
  38. +555 -0  data/purposeCombined/Azure/storage-blob.py
  39. +130 -0  data/purposeCombined/Azure/table-service.py
  40. +218 -0  data/purposeCombined/Azure/table-storage.py
  41. BIN  data/purposeCombined/BI/.DS_Store
  42. +47 -0  data/purposeCombined/BI/BIL.py
  43. +1 -0  data/purposeCombined/BI/BusinessIntelligence-Kaggle
  44. +606 -0  data/purposeCombined/BI/ID3_classification.py
  45. +336 -0  data/purposeCombined/BI/Practica2.py
  46. +132 -0  data/purposeCombined/BI/apriori.py
  47. +440 -0  data/purposeCombined/BI/bi_main.py
  48. +727 -0  data/purposeCombined/BI/cube-backup.py
  49. +727 -0  data/purposeCombined/BI/cube.py
  50. +197 -0  data/purposeCombined/BI/etl_testing.py
  51. +33 -0  data/purposeCombined/BI/examples/__init__.py
  52. +63 -0  data/purposeCombined/BI/examples/bart_lines.py
  53. +763 -0  data/purposeCombined/BI/examples/birth_names.py
  54. +373 -0  data/purposeCombined/BI/examples/countries.md
  55. +2505 -0  data/purposeCombined/BI/examples/countries.py
  56. +114 -0  data/purposeCombined/BI/examples/country_map.py
  57. +100 -0  data/purposeCombined/BI/examples/css_templates.py
  58. +529 -0  data/purposeCombined/BI/examples/deck.py
  59. +141 -0  data/purposeCombined/BI/examples/energy.py
  60. +68 -0  data/purposeCombined/BI/examples/flights.py
  61. +78 -0  data/purposeCombined/BI/examples/helpers-backup.py
  62. +78 -0  data/purposeCombined/BI/examples/helpers.py
  63. +116 -0  data/purposeCombined/BI/examples/long_lat.py
  64. +224 -0  data/purposeCombined/BI/examples/misc_dashboard-backup.py
  65. +224 -0  data/purposeCombined/BI/examples/misc_dashboard.py
  66. +58 -0  data/purposeCombined/BI/examples/multi_line.py
  67. +117 -0  data/purposeCombined/BI/examples/multiformat_time_series.py
  68. +60 -0  data/purposeCombined/BI/examples/paris.py
  69. +81 -0  data/purposeCombined/BI/examples/random_time_series.py
  70. +62 -0  data/purposeCombined/BI/examples/sf_population_polygons.py
  71. +342 -0  data/purposeCombined/BI/examples/tabbed_dashboard-backup.py
  72. +342 -0  data/purposeCombined/BI/examples/tabbed_dashboard.py
  73. +163 -0  data/purposeCombined/BI/examples/unicode_test_data.py
  74. +574 -0  data/purposeCombined/BI/examples/world_bank.py
  75. +580 -0  data/purposeCombined/BI/income_disparity_final_version_2.py
  76. +338 -0  data/purposeCombined/BI/macro_analysis-backup.py
  77. +338 -0  data/purposeCombined/BI/macro_analysis.py
  78. +662 -0  data/purposeCombined/BI/practica3.py
  79. +98 -0  data/purposeCombined/Directory/IOTA2Directory.py
  80. +31 -0  data/purposeCombined/Directory/advance_touch.py
  81. +213 -0  data/purposeCombined/Directory/augmentation_main.py
  82. +92 -0  data/purposeCombined/Directory/conftest.py
  83. +394 -0  data/purposeCombined/Directory/data_preprocessing_utils.py
  84. +122 -0  data/purposeCombined/Directory/diml_to_interiornet.py
  85. +177 -0  data/purposeCombined/Directory/ego_to_json.py
  86. +107 -0  data/purposeCombined/Directory/esquema.py
  87. +41 -0  data/purposeCombined/Directory/file_handler.py
  88. +130 -0  data/purposeCombined/Directory/generate_directories.py
  89. +167 -0  data/purposeCombined/Directory/logging.py
  90. +27 -0  data/purposeCombined/Directory/make_folder.py
  91. +90 -0  data/purposeCombined/Directory/mkdir.py
  92. +135 -0  data/purposeCombined/Directory/mkdirPypi.py
  93. +12 -0  data/purposeCombined/Directory/mkdir_p.py
  94. +80 -0  data/purposeCombined/Directory/project_creator.py
  95. +206 -0  data/purposeCombined/Directory/setup.py
  96. +49 -0  data/purposeCombined/Directory/split_data_in_k_folds.py
  97. +80 -0  data/purposeCombined/Directory/stc_vid2frames.py
  98. +197 -0  data/purposeCombined/Directory/test_archive.py
  99. +306 -0  data/purposeCombined/Directory/test_tool.py
  100. +272 -0  data/purposeCombined/Directory/tutorial.py

+ 3 - 0
.gitignore

@@ -0,0 +1,3 @@
+.DS_Store
+*.pyc
+__pycache__

+ 3 - 0
.idea/.gitignore

@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml

+ 10 - 0
.idea/GnnForPrivacyScan.iml

@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$" isTestSource="false" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>

+ 15 - 0
.idea/inspectionProfiles/Project_Default.xml

@@ -0,0 +1,15 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="2">
+            <item index="0" class="java.lang.String" itemvalue="requests" />
+            <item index="1" class="java.lang.String" itemvalue="Flask" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>

+ 6 - 0
.idea/inspectionProfiles/profiles_settings.xml

@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

+ 7 - 0
.idea/misc.xml

@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (base)" project-jdk-type="Python SDK" />
+  <component name="PyCharmProfessionalAdvertiser">
+    <option name="shown" value="true" />
+  </component>
+</project>

+ 8 - 0
.idea/modules.xml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/GnnForPrivacyScan.iml" filepath="$PROJECT_DIR$/.idea/GnnForPrivacyScan.iml" />
+    </modules>
+  </component>
+</project>

+ 30 - 0
.idea/vcs.xml

@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/Azure/AzureStorage" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/Azure/azure-multiapi-storage-python" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/Azure/python-text-classification" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/BI/BusinessIntelligence-Kaggle" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/Hash/Encryption_And_Hashing" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/ML/ML-In-Action" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/ML/Machine-Learining-Security" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/ML/Machine-Learning" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/ML/Machine_Learning_and_Having_It_Deep_and_Structured" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/NATS/NatsExample" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/NATS/asyncio-nats-examples" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/Pseudonym/Data-Masking" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/S3/NatsExample" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/S3/odoo-s3-storage" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/S3/s3-concat" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/archive/auto-archiver" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/kafka/Calories-Alert-Kafka" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/kafka/MessageCorps" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/kafka/ai-project-fraud-detection" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/kafka/kafka-fraud-detector" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/kafka/kafkaesk" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/kafka/scrapy-kafka" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/kafka/tail2kafka" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/data/purposeCombined/visualize/Visualization-of-popular-algorithms-in-Python" vcs="Git" />
+  </component>
+</project>


+ 65 - 0
data/edge.txt

@@ -0,0 +1,65 @@
+name 1
+asname 2
+target 3
+annotation 4
+value 5
+simple 6
+arg 7
+type_comment 8
+posonlyargs 9
+args 10
+vararg 11
+kwonlyargs 12
+kw_defaults 13
+kwarg 14
+defaults 15
+test 16
+msg 17
+targets 18
+iter 19
+body 20
+orelse 21
+decorator_list 22
+returns 23
+items 24
+attr 25
+ctx 26
+op 27
+left 28
+right 29
+values 30
+func 31
+keywords 32
+bases 33
+ops 34
+comparators 35
+ifs 36
+is_async 37
+kind 38
+keys 39
+key 40
+generators 41
+type 42
+conversion 43
+format_spec 44
+argtypes 45
+elt 46
+names 47
+module 48
+level 49
+elts 50
+type_ignores 51
+id 52
+exc 53
+cause 54
+lower 55
+upper 56
+step 57
+slice 58
+handlers 59
+finalbody 60
+lineno 61
+tag 62
+operand 63
+context_expr 64
+optional_vars 65

+ 108 - 0
data/node.txt

@@ -0,0 +1,108 @@
+AST 1
+operator 2
+Add 3
+alias 4
+boolop 5
+And 6
+stmt 7
+AnnAssign 8
+arg 9
+arguments 10
+Assert 11
+Assign 12
+AsyncFor 13
+AsyncFunctionDef 14
+AsyncWith 15
+expr 16
+Attribute 17
+AugAssign 18
+Await 19
+BinOp 20
+BitAnd 21
+BitOr 22
+BitXor 23
+BoolOp 24
+Break 25
+Call 26
+ClassDef 27
+cmpop 28
+Compare 29
+comprehension 30
+Constant 31
+Continue 32
+expr_context 33
+Del 34
+Delete 35
+Dict 36
+DictComp 37
+Div 38
+Eq 39
+excepthandler 40
+ExceptHandler 41
+Expr 42
+mod 43
+Expression 44
+FloorDiv 45
+For 46
+FormattedValue 47
+FunctionDef 48
+FunctionType 49
+GeneratorExp 50
+Global 51
+Gt 52
+GtE 53
+If 54
+IfExp 55
+Import 56
+ImportFrom 57
+In 58
+Interactive 59
+unaryop 60
+Invert 61
+Is 62
+IsNot 63
+JoinedStr 64
+keyword 65
+Lambda 66
+List 67
+ListComp 68
+Load 69
+LShift 70
+Lt 71
+LtE 72
+MatMult 73
+Mod 74
+Module 75
+Mult 76
+Name 77
+NamedExpr 78
+Nonlocal 79
+Not 80
+NotEq 81
+NotIn 82
+Or 83
+Pass 84
+Pow 85
+Raise 86
+Return 87
+RShift 88
+Set 89
+SetComp 90
+Slice 91
+Starred 92
+Store 93
+Sub 94
+Subscript 95
+Try 96
+Tuple 97
+type_ignore 98
+TypeIgnore 99
+UAdd 100
+UnaryOp 101
+USub 102
+While 103
+With 104
+withitem 105
+Yield 106
+YieldFrom 107
+__loader__ 108

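Note: data/edge.txt and data/node.txt map Python ast field names and node types to integer IDs, which suggests each source file under data/purposeCombined/ is meant to be encoded as a graph over its AST. The code that consumes these vocabularies is not part of this commit, so the following is only a minimal sketch under that assumption; the function names and the fallback ID 0 are illustrative, not taken from the repository.

import ast

def load_vocab(path):
    # Each line in node.txt / edge.txt is "<name> <integer id>".
    vocab = {}
    with open(path) as handle:
        for line in handle:
            name, idx = line.split()
            vocab[name] = int(idx)
    return vocab

def ast_to_graph(source, node_vocab, edge_vocab):
    # Hypothetical sketch: return node type IDs and (parent, child, edge type ID) triples.
    tree = ast.parse(source)
    nodes, edges, index = [], [], {}

    def visit(node):
        index[node] = len(nodes)
        nodes.append(node_vocab.get(type(node).__name__, 0))  # 0 = unknown type (assumption)
        for field, value in ast.iter_fields(node):
            children = value if isinstance(value, list) else [value]
            for child in children:
                if isinstance(child, ast.AST):
                    visit(child)
                    edges.append((index[node], index[child], edge_vocab.get(field, 0)))

    visit(tree)
    return nodes, edges

if __name__ == "__main__":
    node_vocab = load_vocab("data/node.txt")
    edge_vocab = load_vocab("data/edge.txt")
    print(ast_to_graph("x = 1 + 2", node_vocab, edge_vocab))

For "x = 1 + 2" this produces node IDs for Module, Assign, Name, Store, BinOp, Constant, Add and Constant, with edges labelled body, targets, ctx, value, left, op and right, all of which appear in the two vocabularies above.
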
BIN
data/purposeCombined/.DS_Store


+ 3 - 0
data/purposeCombined/Azure/.vscode/settings.json

@@ -0,0 +1,3 @@
+{
+    "python.formatting.provider": "black"
+}

+ 82 - 0
data/purposeCombined/Azure/AddUp/Azure-blob-storage.py

@@ -0,0 +1,82 @@
+from azure.storage.blob import BlobClient, BlobServiceClient
+import os
+import requests
+
+def list_files() -> list:
+    file_list = []
+    
+    for root, dirs, files in os.walk("data"):
+        for name in files:
+            file_list.append({"file_name": name, "local_path": os.path.join(root,name)})
+
+    return file_list
+
+def get_filename_from_url(url: str) -> str:
+    file_name=url.split('/')[-1]
+    return file_name
+
+def get_random_images() -> list:
+    # helper function uses loremflickr.com to get a random list of images 
+    images = []
+
+    for i in range(10):
+        resp = requests.get(url=f"https://loremflickr.com/json/320/240?random={i}")
+        resp_json = resp.json()
+        images.append(resp_json["file"])
+
+    return images
+
+def create_blob_from_url(storage_connection_string,container_name):
+    try:
+        # urls to fetch into blob storage
+        url_list = get_random_images()
+
+        # Instantiate a new BlobServiceClient and a new ContainerClient
+        blob_service_client = BlobServiceClient.from_connection_string(storage_connection_string)
+        container_client = blob_service_client.get_container_client(container_name)
+
+        for u in url_list:
+            # Download file from url then upload blob file
+            r = requests.get(u, stream = True)
+            if r.status_code == 200:
+                r.raw.decode_content = True
+                blob_client = container_client.get_blob_client(get_filename_from_url(u))
+                blob_client.upload_blob(r.raw,overwrite=True)
+        return True
+        
+    except Exception as e:
+        print(e)
+        return False 
+
+def create_blob_from_path(storage_connection_string,container_name):
+    try:
+        # Instantiate a new BlobServiceClient and a new ContainerClient
+        blob_service_client = BlobServiceClient.from_connection_string(storage_connection_string)
+        container_client = blob_service_client.get_container_client(container_name)
+
+        for f in list_files():
+            with open(f["local_path"], "rb") as data:
+                blob_client = container_client.get_blob_client(f["file_name"])
+                blob_client.upload_blob(data,overwrite=True)
+        return True
+
+    except Exception as e:
+        print(e)
+        return False
+
+if __name__ == '__main__':
+
+    # get storage account settings
+    storage_connection_string = os.environ.get("STORAGE_CONNECTION_STRING")
+    container_name = os.environ.get("STORAGE_CONTAINER")
+
+    # # if you want to copy from a public url
+    result = create_blob_from_url(storage_connection_string,container_name)
+    
+    # OR if you want to upload from your local drive
+    #create_blob_from_path(storage_connection_string,container_name)
+
+    if(result):
+        print("Done!")
+    else:
+        print("An error occured!")

+ 51 - 0
data/purposeCombined/Azure/AddUp/blob-upload-1.py

@@ -0,0 +1,51 @@
+import os
+from flask import Flask, request, redirect, url_for
+from werkzeug import secure_filename
+from azure.storage.blob import BlockBlobService
+import string, random, requests
+
+app = Flask(__name__, instance_relative_config=True)
+
+app.config.from_pyfile('config.py')
+account = app.config['ACCOUNT']   # Azure account name
+key = app.config['STORAGE_KEY']      # Azure Storage account access key  
+container = app.config['CONTAINER'] # Container name
+
+blob_service = BlockBlobService(account_name=account, account_key=key)
+
+@app.route('/', methods=['GET', 'POST'])
+def upload_file():
+    if request.method == 'POST':
+        file = request.files['file']
+        filename = secure_filename(file.filename)
+        fileextension = filename.rsplit('.',1)[1]
+        Randomfilename = id_generator()
+        filename = Randomfilename + '.' + fileextension
+        try:
+            blob_service.create_blob_from_stream(container, filename, file)
+        except Exception as e:
+            print('Exception=' + str(e))
+            pass
+        ref =  'http://'+ account + '.blob.core.windows.net/' + container + '/' + filename
+        return '''
+	    <!doctype html>
+	    <title>File Link</title>
+	    <h1>Uploaded File Link</h1>
+	    <p>''' + ref + '''</p>
+	    <img src="'''+ ref +'''">
+	    '''
+    return '''
+    <!doctype html>
+    <title>Upload new File</title>
+    <h1>Upload new File</h1>
+    <form action="" method=post enctype=multipart/form-data>
+      <p><input type=file name=file>
+         <input type=submit value=Upload>
+    </form>
+    '''
+
+def id_generator(size=32, chars=string.ascii_uppercase + string.digits):
+    return ''.join(random.choice(chars) for _ in range(size))
+
+if __name__ == '__main__':
+    app.run(debug=True)

+ 221 - 0
data/purposeCombined/Azure/AddUp/blob-upload-2.py

@@ -0,0 +1,221 @@
+import mimetypes
+import datetime
+
+from azure.common import AzureMissingResourceHttpError
+from azure.storage.blob import BlobService
+
+from django.core.files.storage import Storage
+from django.conf import settings
+
+try:
+    from django.utils.deconstruct import deconstructible
+except ImportError:
+    # Support for django 1.7 and below
+    def deconstructible(func):
+        return func
+
+
+@deconstructible
+class AzureStorage(Storage):
+    """
+    Custom file storage system for Azure
+    """
+
+    container = settings.AZURE_STORAGE.get('CONTAINER')
+    account_name = settings.AZURE_STORAGE.get('ACCOUNT_NAME')
+    account_key = settings.AZURE_STORAGE.get('ACCOUNT_KEY')
+    cdn_host = settings.AZURE_STORAGE.get('CDN_HOST')
+    use_ssl = settings.AZURE_STORAGE.get('USE_SSL')
+
+    def __init__(self, account_name=None, account_key=None, container=None,
+         use_ssl=None, cdn_host=None):
+
+        if account_name is not None:
+            self.account_name = account_name
+
+        if account_key is not None:
+            self.account_key = account_key
+
+        if container is not None:
+            self.container = container
+
+        if use_ssl is not None:
+            self.use_ssl = use_ssl
+
+        if cdn_host is not None:
+            self.cdn_host = cdn_host
+
+    def __getstate__(self):
+        return dict(
+            account_name=self.account_name,
+            account_key=self.account_key,
+            container=self.container,
+            cdn_host=self.cdn_host,
+            use_ssl=self.use_ssl
+        )
+
+    def _get_service(self):
+        if not hasattr(self, '_blob_service'):
+            self._blob_service = BlobService(
+                account_name=self.account_name,
+                account_key=self.account_key,
+                protocol='https' if self.use_ssl else 'http'
+            )
+
+        return self._blob_service
+
+    def _get_properties(self, name):
+        return self._get_service().get_blob_properties(
+            container_name=self.container,
+            blob_name=name
+        )
+
+    def _open(self, name, mode='rb'):
+        """
+        Return the AzureStorageFile.
+        """
+
+        from django.core.files.base import ContentFile
+
+        contents = self._get_service().get_blob_to_bytes(
+            container_name=self.container,
+            blob_name=name
+        )
+
+        return ContentFile(contents)
+
+    def _save(self, name, content):
+        """
+        Use the Azure Storage service to write ``content`` to a remote file
+        (called ``name``).
+        """
+        
+
+        content.open()
+
+        content_type = None
+
+        if hasattr(content.file, 'content_type'):
+            content_type = content.file.content_type
+        else:
+            content_type = mimetypes.guess_type(name)[0]
+
+        cache_control = self.get_cache_control(
+            self.container,
+            name,
+            content_type
+        )
+
+        self._get_service().put_block_blob_from_file(
+            container_name=self.container,
+            blob_name=name,
+            stream=content,
+            x_ms_blob_content_type=content_type,
+            cache_control=cache_control,
+            x_ms_blob_cache_control=cache_control
+        )
+
+        content.close()
+
+        return name
+
+    def listdir(self, path):
+        """
+        Lists the contents of the specified path, returning a 2-tuple of lists;
+        the first item being directories, the second item being files.
+        """
+
+        files = []
+
+        if path and not path.endswith('/'):
+            path = '%s/' % path
+
+        path_len = len(path)
+
+        if not path:
+            path = None
+
+        blob_list = self._get_service().list_blobs(self.container, prefix=path)
+
+        for name in blob_list:
+            files.append(name[path_len:])
+
+        return ([], files)
+
+    def exists(self, name):
+        """
+        Returns True if a file referenced by the given name already exists in
+        the storage system, or False if the name is available for a new file.
+        """
+        try:
+            self._get_properties(name)
+
+            return True
+        except AzureMissingResourceHttpError:
+            return False
+
+    def delete(self, name):
+        """
+        Deletes the file referenced by name.
+        """
+
+        try:
+            self._get_service().delete_blob(self.container, name)
+        except AzureMissingResourceHttpError:
+            pass
+
+    def get_cache_control(self, container, name, content_type):
+        """
+        Get the Cache-Control value for a blob, used when saving the blob on
+        Azure.  Returns `None` by default to remain compatible with the
+        default setting for the SDK.
+        """
+
+        return None
+
+    def size(self, name):
+        """
+        Returns the total size, in bytes, of the file referenced by name.
+        """
+
+        try:
+            properties = self._get_properties(name)
+
+            return int(properties['content-length'])
+        except AzureMissingResourceHttpError:
+            pass
+
+    def url(self, name):
+        """
+        Returns the URL where the contents of the file referenced by name can
+        be accessed.
+        """
+
+        blob_url_args = {
+            'container_name': self.container,
+            'blob_name': name,
+        }
+
+        if self.cdn_host:
+            # The account name should be built into the cdn hostname
+            blob_url_args['account_name'] = ''
+            blob_url_args['host_base'] = self.cdn_host
+
+        return self._get_service().make_blob_url(
+            **blob_url_args
+        )
+
+    def modified_time(self, name):
+        """
+        Returns a datetime object containing the last modified time.
+        """
+
+        try:
+            properties = self._get_properties(name)
+
+            return datetime.datetime.strptime(
+                properties['last-modified'],
+                '%a, %d %b %Y %H:%M:%S %Z'
+            )
+        except AzureMissingResourceHttpError:
+            pass

+ 107 - 0
data/purposeCombined/Azure/AddUp/blob-upload.py

@@ -0,0 +1,107 @@
+# ----------------------------------------------------------------------------------
+# MIT License
+#
+# Copyright(c) Microsoft Corporation. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# ----------------------------------------------------------------------------------
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+
+import os
+import uuid
+import sys
+from azure.storage.blob import BlockBlobService, PublicAccess
+
+# ---------------------------------------------------------------------------------------------------------
+# Method that creates a test file in the 'Sample' folder.
+# This sample application creates a test file, uploads the test file to the Blob storage,
+# lists the blobs in the container, and downloads the file with a new name.
+# ---------------------------------------------------------------------------------------------------------
+# Documentation References:
+# Associated Article - https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python
+# What is a Storage Account - http://azure.microsoft.com/en-us/documentation/articles/storage-whatis-account/
+# Getting Started with Blobs-https://docs.microsoft.com/en-us/azure/storage/blobs/storage-python-how-to-use-blob-storage
+# Blob Service Concepts - http://msdn.microsoft.com/en-us/library/dd179376.aspx
+# Blob Service REST API - http://msdn.microsoft.com/en-us/library/dd135733.aspx
+# ----------------------------------------------------------------------------------------------------------
+
+
+def run_sample():
+    try:
+        # Create the BlockBlobService that is used to call the Blob service for the storage account
+        blob_service_client = BlockBlobService(
+            account_name='accountname', account_key='accountkey')
+
+        # Create a container called 'quickstartblobs'.
+        container_name = 'quickstartblobs'
+        blob_service_client.create_container(container_name)
+
+        # Set the permission so the blobs are public.
+        blob_service_client.set_container_acl(
+            container_name, public_access=PublicAccess.Container)
+
+        # Create the Sample folder if it does not exist, and create a file in it to test the upload and download.
+        local_path = os.path.expanduser("~/Sample")
+        if not os.path.exists(local_path):
+            os.makedirs(os.path.expanduser("~/Sample"))
+        local_file_name = "QuickStart_" + str(uuid.uuid4()) + ".txt"
+        full_path_to_file = os.path.join(local_path, local_file_name)
+
+        # Write text to the file.
+        file = open(full_path_to_file,  'w')
+        file.write("Hello, World!")
+        file.close()
+
+        print("Temp file = " + full_path_to_file)
+        print("\nUploading to Blob storage as blob" + local_file_name)
+
+        # Upload the created file, use local_file_name for the blob name
+        blob_service_client.create_blob_from_path(
+            container_name, local_file_name, full_path_to_file)
+
+        # List the blobs in the container
+        print("\nList blobs in the container")
+        generator = blob_service_client.list_blobs(container_name)
+        for blob in generator:
+            print("\t Blob name: " + blob.name)
+
+        # Download the blob(s).
+        # Add '_DOWNLOADED' as prefix to '.txt' so you can see both files in Documents.
+        full_path_to_file2 = os.path.join(local_path, str.replace(
+            local_file_name ,'.txt', '_DOWNLOADED.txt'))
+        print("\nDownloading blob to " + full_path_to_file2)
+        blob_service_client.get_blob_to_path(
+            container_name, local_file_name, full_path_to_file2)
+
+        sys.stdout.write("Sample finished running. When you hit <any key>, the sample will be deleted and the sample "
+                         "application will exit.")
+        sys.stdout.flush()
+        input()
+
+        # Clean up resources. This includes the container and the temp files
+        blob_service_client.delete_container(container_name)
+        os.remove(full_path_to_file)
+        os.remove(full_path_to_file2)
+    except Exception as e:
+        print(e)
+
+
+# Main method.
+if __name__ == '__main__':
+    run_sample()

+ 231 - 0
data/purposeCombined/Azure/AddUp/circuitbreaker.py

@@ -0,0 +1,231 @@
+# ----------------------------------------------------------------------------------
+# MIT License
+#
+# Copyright(c) Microsoft Corporation. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# ----------------------------------------------------------------------------------
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# ----------------------------------------------------------------------------------
+
+import os
+import uuid
+import time
+import sys
+from azure.storage.blob import BlockBlobService
+from azure.storage.common.models import LocationMode
+from azure.storage.common.retry import LinearRetry
+
+
+# ----------------------------------------------------------------------------------
+# Azure Storage Circuit Breaker Demo
+# INSTRUCTIONS
+# Please see the README.md file for an overview explaining this application and how to run it.
+# ----------------------------------------------------------------------------------
+# Documentation References:
+# Associated Article - https://docs.microsoft.com/en-us/azure/storage/blobs/storage-create-geo-redundant-storage-python
+# Designing HA Apps with RA-GRS storage -https://docs.microsoft.com/azure/storage/storage-designing-ha-apps-with-ra-grs/
+# Getting Started with Blobs-https://docs.microsoft.com/en-us/azure/storage/blobs/storage-python-how-to-use-blob-storage
+# Azure Storage Replication - https://docs.microsoft.com/azure/storage/storage-redundancy
+# ----------------------------------------------------------------------------------
+
+account_name = os.environ.get('accountname')
+account_key = os.environ.get('accountkey')
+
+# Track how many times retry events occur.
+retry_count = 0  # Number of retries that have occurred
+retry_threshold = 5  # Threshold number of retries before switching to secondary
+secondary_read_count = 0  # Number of reads from secondary that have occurred
+secondary_threshold = 20  # Threshold number of reads from secondary before switching back to primary
+
+# This is the CloudBlobClient object used to access the blob service
+blob_client = None
+
+# This is the container used to store and access the blob to be used for testing
+container_name = None
+
+'''
+Main method. Sets up the objects needed, then performs a loop that repeatedly runs a blob
+ operation, responding to the Retry and Response Received events.
+'''
+
+
+def run_circuit_breaker():
+    # Name of image to use for testing.
+    image_to_upload = "HelloWorld.png"
+
+    global blob_client
+    global container_name
+    try:
+
+        # Create a reference to the blob client and container using the storage account name and key
+        blob_client = BlockBlobService(account_name, account_key)
+
+        # Make the container unique by using a UUID in the name.
+        container_name = "democontainer" + str(uuid.uuid4())
+        blob_client.create_container(container_name)
+
+    except Exception as ex:
+        print("Please make sure you have put the correct storage account name and key.")
+        print(ex)
+
+    # Define a reference to the actual blob and upload the block_blob to the newly created container
+    full_path_to_file = os.path.join(os.path.dirname(__file__), image_to_upload)
+    blob_client.create_blob_from_path(container_name, image_to_upload, full_path_to_file)
+
+    # Set the location mode to secondary, so you can check just the secondary data center.
+    blob_client.location_mode = LocationMode.SECONDARY
+    blob_client.retry = LinearRetry(backoff=0).retry
+
+    # Before proceeding, wait until the blob has been replicated to the secondary data center.
+    # Loop and check for the presence of the blob once in a second until it hits 60 seconds
+    # or until it finds it
+    counter = 0
+    while counter < 60:
+        counter += 1
+        sys.stdout.write("\nAttempt {0} to see if the blob has replicated to the secondary storage yet.".format(counter))
+        sys.stdout.flush()
+        if blob_client.exists(container_name, image_to_upload):
+            break
+
+        # Wait a second, then loop around and try again
+        # When it's finished replicating to the secondary, continue.
+        time.sleep(1)
+
+    # Set the starting LocationMode to Primary, then Secondary.
+    # Here we use the linear retry by default, but allow it to retry to secondary if
+    # the initial request to primary fails.
+    # Note that the default is Primary. You must have RA-GRS enabled to use this
+    blob_client.location_mode = LocationMode.PRIMARY
+    blob_client.retry = LinearRetry(max_attempts=retry_threshold, backoff=1).retry
+
+    ''' 
+        ************INSTRUCTIONS**************
+        To perform the test, first replace the 'accountname' and 'accountkey' with your storage account name and key.
+        Every time it calls get_blob_to_path it will hit the response_callback function.
+
+        Next, run this app. While this loop is running, pause the program by pressing any key, and
+        put the intercept code in Fiddler (that will intercept and return a 503).
+
+        For instructions on modifying Fiddler, look at the Fiddler_script.text file in this project.
+        There are also full instructions in the ReadMe_Instructions.txt file included in this project.
+
+        After adding the custom script to Fiddler, calls to primary storage will fail with a retryable
+        error which will trigger the Retrying event (above).
+        Then it will switch over and read the secondary. It will do that 20 times, then try to
+        switch back to the primary.
+        After seeing that happen, pause this again and remove the intercepting Fiddler code
+        Then you'll see it return to the primary and finish.
+        '''
+
+    print("\n\nThe application will pause at 200 unit interval")
+
+    for i in range(0, 1000):
+        if blob_client.location_mode == LocationMode.SECONDARY:
+            sys.stdout.write("S{0} ".format(str(i)))
+        else:
+            sys.stdout.write("P{0} ".format(str(i)))
+        sys.stdout.flush()
+
+        try:
+
+            # This function is called immediately after retry evaluation is performed.
+            # It is used to trigger the change from primary to secondary and back
+            blob_client.retry_callback = retry_callback
+
+            # Download the file
+            blob_client.get_blob_to_path(container_name, image_to_upload,
+                                                str.replace(full_path_to_file, ".png", "Copy.png"))
+
+            # Set the application to pause at 200 unit intervals to implement simulated failures
+            if i == 200 or i == 400 or i == 600 or i == 800:
+                sys.stdout.write("\nPress the Enter key to resume")
+                sys.stdout.flush()
+                if sys.version_info[0] < 3:
+                    raw_input()
+                else:
+                    input()
+        except Exception as ex:
+            print(ex)
+        finally:
+            # Force an exists call to succeed by resetting the status
+            blob_client.response_callback = response_callback
+
+    # Clean up resources
+    blob_client.delete_container(container_name)
+
+
+'''
+RequestCompleted Event handler
+If it's not pointing at the secondary, let it go through. It was either successful,
+or it failed with a non-retryable event.
+If it's pointing at the secondary, increment the read count.
+If the number of reads has hit the threshold of how many reads you want to do against the secondary,
+before you switch back to primary, switch back and reset the secondary_read_count.
+'''
+
+
+def response_callback(response):
+    global secondary_read_count
+    if blob_client.location_mode == LocationMode.SECONDARY:
+
+        # You're reading the secondary. Let it read the secondary [secondaryThreshold] times,
+        # then switch back to the primary and see if it is available now.
+        secondary_read_count += 1
+        if secondary_read_count >= secondary_threshold:
+            blob_client.location_mode = LocationMode.PRIMARY
+            secondary_read_count = 0
+
+
+'''
+Retry Event handler
+If it has retried more times than allowed, and it's not already pointed to the secondary,
+flip it to the secondary and reset the retry count.
+If it has retried more times than allowed, and it's already pointed to the secondary throw an exception.
+'''
+
+
+def retry_callback(retry_context):
+    global retry_count
+    retry_count = retry_context.count
+    sys.stdout.write("\nRetrying event because of failure reading the primary. RetryCount= {0}".format(retry_count))
+    sys.stdout.flush()
+
+    # Check if we have more than n-retries in which case switch to secondary
+    if retry_count >= retry_threshold:
+
+        # Check to see if we can fail over to secondary.
+        if blob_client.location_mode != LocationMode.SECONDARY:
+            blob_client.location_mode = LocationMode.SECONDARY
+            retry_count = 0
+        else:
+            raise Exception("Both primary and secondary are unreachable. "
+                            "Check your application's network connection.")
+
+
+if __name__ == '__main__':
+    print("Azure storage Circuit Breaker Sample \n")
+    try:
+        run_circuit_breaker()
+    except Exception as e:
+        print("Error thrown = {0}".format(e))
+    sys.stdout.write("\nPress any key to exit.")
+    sys.stdout.flush()
+    if sys.version_info[0]<3:
+        raw_input()
+    else:
+        input()

+ 138 - 0
data/purposeCombined/Azure/AddUp/datafactory.py

@@ -0,0 +1,138 @@
+from azure.common.credentials import ServicePrincipalCredentials
+from azure.mgmt.resource import ResourceManagementClient
+from azure.mgmt.datafactory import DataFactoryManagementClient
+from azure.mgmt.datafactory.models import *
+from datetime import datetime, timedelta
+import time
+
+def print_item(group):
+    """Print an Azure object instance."""
+    print("\tName: {}".format(group.name))
+    print("\tId: {}".format(group.id))
+    if hasattr(group, 'location'):
+        print("\tLocation: {}".format(group.location))
+    if hasattr(group, 'tags'):
+        print("\tTags: {}".format(group.tags))
+    if hasattr(group, 'properties'):
+        print_properties(group.properties)
+    print("\n")        
+
+def print_properties(props):
+    """Print a ResourceGroup properties instance."""
+    if props and hasattr(props, 'provisioning_state') and props.provisioning_state:
+        print("\tProperties:")
+        print("\t\tProvisioning State: {}".format(props.provisioning_state))
+    print("\n")
+
+def print_activity_run_details(activity_run):
+    """Print activity run details."""
+    print("\n\tActivity run details\n")
+    print("\tActivity run status: {}".format(activity_run.status))    
+    if activity_run.status == 'Succeeded':
+        print("\tNumber of bytes read: {}".format(activity_run.output['dataRead']))       
+        print("\tNumber of bytes written: {}".format(activity_run.output['dataWritten']))           
+        print("\tCopy duration: {}".format(activity_run.output['copyDuration']))           
+    else:
+        print("\tErrors: {}".format(activity_run.error['message']))
+
+def main():
+
+    # Azure subscription ID
+    subscription_id = '<Azure subscription ID>'
+
+    # This program creates this resource group. If it's an existing resource group, comment out the code that creates the resource group
+    rg_name = '<Azure resource group name>'
+
+    # The data factory name. It must be globally unique.
+    df_name = '<Data factory name>'        
+
+    # Specify your Active Directory client ID, client secret, and tenant ID
+    credentials = ServicePrincipalCredentials(client_id='<AAD application ID>', secret='<AAD app authentication key>', tenant='<AAD tenant ID>')
+    resource_client = ResourceManagementClient(credentials, subscription_id)
+    adf_client = DataFactoryManagementClient(credentials, subscription_id)
+
+    rg_params = {'location':'eastus'}
+    df_params = {'location':'eastus'}
+
+    # create the resource group
+    # comment out if the resource group already exists
+    resource_client.resource_groups.create_or_update(rg_name, rg_params)
+
+    # Create a data factory
+    df_resource = Factory(location='eastus')
+    df = adf_client.factories.create_or_update(rg_name, df_name, df_resource)
+    print_item(df)
+    while df.provisioning_state != 'Succeeded':
+        df = adf_client.factories.get(rg_name, df_name)
+        time.sleep(1)
+
+    # Create an Azure Storage linked service
+    ls_name = 'storageLinkedService'
+
+    # Specify the name and key of your Azure Storage account
+    storage_string = SecureString('DefaultEndpointsProtocol=https;AccountName=<Azure storage account>;AccountKey=<Azure storage authentication key>')
+
+    ls_azure_storage = AzureStorageLinkedService(connection_string=storage_string)
+    ls = adf_client.linked_services.create_or_update(rg_name, df_name, ls_name, ls_azure_storage)
+    print_item(ls)
+
+    # Create an Azure blob dataset (input)
+    ds_name = 'ds_in'
+    ds_ls = LinkedServiceReference(ls_name)
+    blob_path= 'adftutorial/inputpy'
+    blob_filename = 'input.txt'
+    ds_azure_blob= AzureBlobDataset(ds_ls, folder_path=blob_path, file_name = blob_filename)
+    ds = adf_client.datasets.create_or_update(rg_name, df_name, ds_name, ds_azure_blob)
+    print_item(ds)
+
+    # Create an Azure blob dataset (output)
+    dsOut_name = 'ds_out'
+    output_blobpath = 'adftutorial/outputpy'
+    dsOut_azure_blob = AzureBlobDataset(ds_ls, folder_path=output_blobpath)
+    dsOut = adf_client.datasets.create_or_update(rg_name, df_name, dsOut_name, dsOut_azure_blob)
+    print_item(dsOut)
+
+    # Create a copy activity
+    act_name =  'copyBlobtoBlob'
+    blob_source = BlobSource()
+    blob_sink = BlobSink()
+    dsin_ref = DatasetReference(ds_name)
+    dsOut_ref = DatasetReference(dsOut_name)
+    copy_activity = CopyActivity(act_name,inputs=[dsin_ref], outputs=[dsOut_ref], source=blob_source, sink=blob_sink)
+
+    # Create a pipeline with the copy activity
+    p_name =  'copyPipeline'
+    params_for_pipeline = {}
+    p_obj = PipelineResource(activities=[copy_activity], parameters=params_for_pipeline)
+    p = adf_client.pipelines.create_or_update(rg_name, df_name, p_name, p_obj)
+    print_item(p)
+
+    # Create a pipeline run
+    run_response = adf_client.pipelines.create_run(rg_name, df_name, p_name,
+        {
+        }
+    )
+
+    # Monitor the pipeline run
+    time.sleep(30)
+    pipeline_run = adf_client.pipeline_runs.get(rg_name, df_name, run_response.run_id)
+    print("\n\tPipeline run status: {}".format(pipeline_run.status))
+    activity_runs_paged = list(adf_client.activity_runs.list_by_pipeline_run(rg_name, df_name, pipeline_run.run_id, datetime.now() - timedelta(1),  datetime.now() + timedelta(1)))
+    print_activity_run_details(activity_runs_paged[0])
+
+    # Create a trigger
+    tr_name = 'mytrigger'
+    scheduler_recurrence = ScheduleTriggerRecurrence(frequency='Minute', interval='15',start_time=datetime.now(), end_time=datetime.now() + timedelta(1), time_zone='UTC') 
+    pipeline_parameters = {'inputPath':'adftutorial/inputpy', 'outputPath':'adftutorial/outputpy'}
+    pipelines_to_run = []
+    pipeline_reference = PipelineReference('copyPipeline')
+    pipelines_to_run.append(TriggerPipelineReference(pipeline_reference, pipeline_parameters))
+    tr_properties = ScheduleTrigger(description='My scheduler trigger', pipelines = pipelines_to_run, recurrence=scheduler_recurrence)    
+    adf_client.triggers.create_or_update(rg_name, df_name, tr_name, tr_properties)
+
+    # start the trigger
+    adf_client.triggers.start(rg_name, df_name, tr_name)
+    
+
+# Start the main method
+main()

+ 202 - 0
data/purposeCombined/Azure/AddUp/file_advanced_samples.py

@@ -0,0 +1,202 @@
+#-------------------------------------------------------------------------
+# Microsoft Developer & Platform Evangelism
+#
+# Copyright (c) Microsoft Corporation. All rights reserved.
+#
+# THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, 
+# EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES 
+# OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
+#----------------------------------------------------------------------------------
+# The example companies, organizations, products, domain names,
+# e-mail addresses, logos, people, places, and events depicted
+# herein are fictitious. No association with any real company,
+# organization, product, domain name, email address, logo, person,
+# places, or events is intended or should be inferred.
+#--------------------------------------------------------------------------
+
+import os
+from random_data import RandomData
+
+from azure.storage.fileshare import ShareServiceClient
+from azure.storage.fileshare import CorsRule, RetentionPolicy, Metrics
+
+#
+# Azure File Service Sample - Demonstrate how to perform common tasks using the Microsoft Azure File Service.  
+#  
+# Documentation References:  
+#  - What is a Storage Account - http://azure.microsoft.com/en-us/documentation/articles/storage-whatis-account/  
+#  - Getting Started with Files - https://azure.microsoft.com/en-us/documentation/articles/storage-python-how-to-use-file-storage/  
+#  - File Service Concepts - http://msdn.microsoft.com/en-us/library/dn166972.aspx  
+#  - File Service REST API - http://msdn.microsoft.com/en-us/library/dn167006.aspx  
+#  - Storage Emulator - http://azure.microsoft.com/en-us/documentation/articles/storage-use-emulator/
+#  
+class FileAdvancedSamples():
+
+    def __init__(self):
+        self.random_data = RandomData()
+    
+    # Runs all samples for Azure Storage File service.
+    def run_all_samples(self, connection_string):
+        print('Azure Storage File Advanced samples - Starting.')
+        
+        try:
+            # Create an instance of ShareServiceClient
+            service = ShareServiceClient.from_connection_string(conn_str=connection_string)
+
+            # List shares
+            print('\n\n* List shares *\n')
+            self.list_shares(service)
+
+            # Set Cors
+            print('\n\n* Set cors rules *\n')
+            self.set_cors_rules(service)
+
+            # Set Service Properties
+            print('\n\n* Set service properties *\n')
+            self.set_service_properties(service)
+
+            # Share, directory and file properties and metadata
+            print('\n\n* Metadata and properties *\n')
+            self.metadata_and_properties(service)
+
+        except Exception as e:
+            print('Error occurred in the sample.', e) 
+
+        finally:
+            print('\nAzure Storage File Advanced samples - Completed.\n')
+    
+    # List file shares
+    def list_shares(self, service):
+        share_prefix = 'sharesample' + self.random_data.get_random_name(6)
+
+        try:        
+            print('1. Create multiple shares with prefix: ', share_prefix)
+            for i in range(5):
+                service.create_share(share_name=share_prefix + str(i))
+            
+            print('2. List shares')
+            shares = service.list_shares()
+            for share in shares:
+                print('  Share name:' + share.name)
+
+        except Exception as e:
+            print(e) 
+
+        finally:
+            print('3. Delete shares with prefix:' + share_prefix) 
+            for i in range(5):
+                service.delete_share(share_prefix + str(i))
+    
+
+    # Set CORS
+    def set_cors_rules(self, service):
+        print('1. Get Cors Rules')
+        original_cors_rules = service.get_service_properties()['cors']
+
+        print('2. Overwrite Cors Rules')
+        cors_rule = CorsRule(
+            allowed_origins=['*'], 
+            allowed_methods=['POST', 'GET'],
+            allowed_headers=['*'],
+            exposed_headers=['*'],
+            max_age_in_seconds=3600)
+
+        try:
+            service.set_service_properties(cors=[cors_rule])
+        except Exception as e:
+            print(e)
+        finally:
+            #reverting cors rules back to the original ones
+            print('3. Revert Cors Rules back the original ones')
+            service.set_service_properties(cors=original_cors_rules)
+        
+        print("CORS sample completed")
+    
+
+    # Manage properties of the File service, including logging and metrics settings, and the default service version.
+    def set_service_properties(self, service):
+
+        print('1. Get File service properties')
+        props = service.get_service_properties()
+
+        retention = RetentionPolicy(enabled=True, days=5)
+        hour_metrics = Metrics(enabled=True, include_apis=True, retention_policy=retention)
+        minute_metrics = Metrics(enabled=False)
+
+        try:
+            print('2. Overwrite File service properties')
+            service.set_service_properties(hour_metrics=hour_metrics, minute_metrics=minute_metrics)
+
+        finally:
+            print('3. Revert File service properties back to the original ones')
+            service.set_service_properties(hour_metrics=props['hour_metrics'], minute_metrics=props['minute_metrics'])
+
+        print('4. Set File service properties completed')
+    
+
+    # Manage metadata and properties of the share
+    def metadata_and_properties(self, service):
+        share_name = 'sharename' + self.random_data.get_random_name(6)
+
+        try:
+            # All directories and share must be created in a parent share.
+            # Max capacity: 5TB per share
+            print('1. Create sample share with name ' + share_name)
+            quota = 1 # in GB
+            metadata = { "foo": "bar", "baz": "foo" }
+            share_client = service.create_share(share_name=share_name)
+            print('Sample share "'+ share_name +'" created.')
+
+            print('2. Get share properties.')
+            properties = share_client.get_share_properties()
+
+            print('3. Get share metadata.')
+            get_metadata = properties['metadata']
+            for k, v in get_metadata.items():
+                print("\t" + k + ": " + v)
+
+            dir_name = 'dirname' + self.random_data.get_random_name(6)
+
+            print('4. Create sample directory with name ' + dir_name)
+            metadata = { "abc": "def", "jkl": "mno" }
+            directory_client = share_client.create_directory(dir_name, metadata=metadata)
+            print('Sample directory "'+ dir_name +'" created.')
+
+            print('5. Get directory properties.')
+            properties = directory_client.get_directory_properties()
+            
+            print('6. Get directory metadata.')
+            get_metadata = properties['metadata']
+            for k, v in get_metadata.items():
+                print("\t" + k + ": " + v)
+
+            file_name = 'sample.txt'
+            # Uploading text to share_name/dir_name/sample.txt in Azure Files account.
+            # Max capacity: 1TB per file
+            print('7. Upload sample file from text to directory.')
+            metadata = { "prop1": "val1", "prop2": "val2" }
+            file_client = directory_client.get_file_client(file_name)
+            file_client.upload_file('Hello World! - from text sample', metadata=metadata)
+            print('Sample file "' + file_name + '" created and uploaded to: ' + share_name + '/' + dir_name)        
+
+            print('8. Get file properties.')
+            properties = file_client.get_file_properties()
+
+            print('9. Get file metadata.')
+            get_metadata = properties['metadata']
+            for k, v in get_metadata.items():
+                print("\t" + k + ": " + v)
+
+            # This is for demo purposes, all files will be deleted when share is deleted
+            print('10. Delete file.')
+            file_client.delete_file()
+
+            # This is for demo purposes, all directories will be deleted when share is deleted
+            print('11. Delete directory.')
+            directory_client.delete_directory()
+
+        finally:
+            print('12. Delete share.')
+            share_client.delete_share(share_name)
+
+        print("Metadata and properties sample completed")

+ 190 - 0
data/purposeCombined/Azure/AddUp/file_basic_samples.py

@@ -0,0 +1,190 @@
+#-------------------------------------------------------------------------
+# Microsoft Developer & Platform Evangelism
+#
+# Copyright (c) Microsoft Corporation. All rights reserved.
+#
+# THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, 
+# EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES 
+# OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
+#----------------------------------------------------------------------------------
+# The example companies, organizations, products, domain names,
+# e-mail addresses, logos, people, places, and events depicted
+# herein are fictitious. No association with any real company,
+# organization, product, domain name, email address, logo, person,
+# places, or events is intended or should be inferred.
+#--------------------------------------------------------------------------
+
+from random_data import RandomData
+import tempfile
+import os
+
+from azure.storage.fileshare import ShareServiceClient
+
+
+class FileBasicSamples():
+
+    def __init__(self):
+        self.random_data = RandomData()
+
+    # Runs all samples for Azure Storage File service.
+    def run_all_samples(self, connection_string):
+        print('Azure Storage File Basic samples - Starting.')
+        
+        #declare variables
+        filename = 'filesample' + self.random_data.get_random_name(6)
+        sharename = 'sharesample' + self.random_data.get_random_name(6)
+        
+        try:
+            # Create an instance of ShareServiceClient
+            service = ShareServiceClient.from_connection_string(conn_str=connection_string)
+
+            print('\n\n* Basic file operations *\n')
+            self.basic_file_operations(sharename, filename, service)
+
+        except Exception as e:
+            print('error: ' + str(e))
+
+        finally:
+            # Delete all Azure Files created in this sample (skip if the client was never created)
+            if service is not None:
+                self.file_delete_samples(sharename, filename, service)
+
+        print('\nAzure Storage File Basic samples - Completed.\n')
+    
+    def basic_file_operations(self, sharename, filename, service):
+        # Creating an SMB file share in your Azure Files account.
+        print('\nAttempting to create a sample file from text for upload demonstration.')   
+        # All directories and files must be created in a parent share.
+        # Max capacity: 5TB per share
+
+        print('Creating sample share.')
+        share_client = service.create_share(share_name=sharename)
+        print('Sample share "'+ sharename +'" created.')
+
+
+        # Creating an optional file directory in your Azure Files account.
+        print('Creating a sample directory.')    
+        # Get the directory client
+        directory_client = share_client.create_directory("mydirectory")
+        print('Sample directory "mydirectory" created.')
+
+
+        # Uploading text to sharename/mydirectory/my_text_file in Azure Files account.
+        # Max capacity: 1TB per file
+        print('Uploading a sample file from text.')   
+        # create_file_client
+        file_client = directory_client.get_file_client(filename)
+        # Upload a file
+        file_client.upload_file('Hello World! - from text sample')
+        print('Sample file "' + filename + '" created and uploaded to: ' + sharename + '/mydirectory')
+  
+
+        # Demonstrate how to copy a file
+        print('\nCopying file ' + filename)
+        # Create another file client which will copy the file from url
+        destination_file_client = share_client.get_file_client('file1copy')
+
+        # Copy the sample source file from the url to the destination file
+        copy_resp = destination_file_client.start_copy_from_url(source_url=file_client.url)
+        if copy_resp['copy_status'] ==  'pending':
+            # Demonstrate how to abort a copy operation (just for demo, probably will never get here)
+            print('Abort copy operation')
+            destination_file_client.abort_copy()
+        else:
+            print('Copy was a ' + copy_resp['copy_status'])
+        
+
+        # Demonstrate how to create a share and upload a file from a local temporary file path
+        print('\nAttempting to upload a sample file from path for upload demonstration.')  
+        # Creating a temporary file to upload to Azure Files
+        print('Creating a temporary file from text.') 
+        with tempfile.NamedTemporaryFile(delete=False) as my_temp_file:
+            my_temp_file.file.write(b"Hello world!")
+        print('Sample temporary file created.') 
+
+        # Uploading my_temp_file to sharename folder in Azure Files
+        # Max capacity: 1TB per file
+        print('Uploading a sample file from local path.')
+        # Create file_client
+        file_client = share_client.get_file_client(filename)
+
+        # Upload a file
+        with open(my_temp_file.name, "rb") as source_file:
+            file_client.upload_file(source_file)
+
+        print('Sample file "' + filename + '" uploaded from path to share: ' + sharename)
+
+        # Close the temp file
+        my_temp_file.close()
+
+        # Get the list of valid ranges and write to the specified range
+        print('\nGet list of valid ranges of the file.') 
+        file_ranges = file_client.get_ranges()
+
+        data = b'abcdefghijkl'
+        print('Put a range of data to the file.')
+        
+        file_client.upload_range(data=data, offset=file_ranges[0]['start'], length=len(data))
+
+
+        # Demonstrate how to download a file from Azure Files
+        # The following example download the file that was previously uploaded to Azure Files
+        print('\nAttempting to download a sample file from Azure files for demonstration.')
+
+        destination_file = os.path.join(tempfile.gettempdir(), 'mypathfile.txt')
+
+        with open(destination_file, "wb") as file_handle:
+            data = file_client.download_file()
+            data.readinto(file_handle)
+
+        print('Sample file downloaded to: ' + destination_file)
+
+
+        # Demonstrate how to list files and directories contained under an Azure File share
+        print('\nAttempting to list files and directories under share "' + sharename + '":')
+
+        # Create a generator to list directories and files under share
+        # This is not a recursive listing operation
+        generator = share_client.list_directories_and_files()
+
+        # Prints the directories and files under the share
+        for file_or_dir in generator:
+            print(file_or_dir['name'])
+        
+        # remove temp file
+        os.remove(my_temp_file.name)
+
+        print('Files and directories under share "' + sharename + '" listed.')
+        print('\nCompleted successfully - Azure basic Files operations.')
+
+
+    # Demonstrate how to delete azure files created for this demonstration
+    # Warning: Deleting a share or directory will also delete all files and directories that are contained in it.
+    def file_delete_samples(self, sharename, filename, service):
+        print('\nDeleting all samples created for this demonstration.')
+
+        try:
+            # Deleting file: 'sharename/mydirectory/filename'
+            # This is for demo purposes only, it's unnecessary, as we're deleting the share later
+            print('Deleting a sample file.')
+
+            share_client = service.get_share_client(sharename)
+            directory_client = share_client.get_directory_client('mydirectory')
+            
+            directory_client.delete_file(file_name=filename)
+            print('Sample file "' + filename + '" deleted from: ' + sharename + '/mydirectory' )
+
+            # Deleting directory: 'sharename/mydirectory'
+            print('Deleting sample directory and all files and directories under it.')
+            share_client.delete_directory('mydirectory')
+            print('Sample directory "/mydirectory" deleted from: ' + sharename)
+
+            # Deleting share: 'sharename'
+            print('Deleting sample share ' + sharename + ' and all files and directories under it.')
+            share_client.delete_share(sharename)
+            print('Sample share "' + sharename + '" deleted.')
+
+            print('\nCompleted successfully - Azure Files samples deleted.')
+
+        except Exception as e:
+            print('********ErrorDelete***********')
+            print(e)

+ 415 - 0
data/purposeCombined/Azure/AddUp/python-quick-start.py

@@ -0,0 +1,415 @@
+# python quickstart client Code Sample
+#
+# Copyright (c) Microsoft Corporation
+#
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+"""
+Create a pool of nodes to output text files from azure blob storage.
+"""
+
+import datetime
+import io
+import os
+import sys
+import time
+
+from azure.storage.blob import (
+    BlobServiceClient,
+    BlobSasPermissions,
+    generate_blob_sas
+)
+from azure.batch import BatchServiceClient
+from azure.batch.batch_auth import SharedKeyCredentials
+import azure.batch.models as batchmodels
+from azure.core.exceptions import ResourceExistsError
+
+import config
+
+DEFAULT_ENCODING = "utf-8"
+
+
+# Update the Batch and Storage account credential strings in config.py with values
+# unique to your accounts. These are used when constructing connection strings
+# for the Batch and Storage client objects.
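+#
+# A minimal config.py sketch, assuming the attribute names read below; the values are
+# placeholders, not real credentials:
+#
+#     STORAGE_ACCOUNT_NAME = "mystorageaccount"
+#     STORAGE_ACCOUNT_KEY = "<storage-account-key>"
+#     STORAGE_ACCOUNT_DOMAIN = "blob.core.windows.net"
+#     BATCH_ACCOUNT_NAME = "mybatchaccount"
+#     BATCH_ACCOUNT_KEY = "<batch-account-key>"
+#     BATCH_ACCOUNT_URL = "https://mybatchaccount.<region>.batch.azure.com"
+#     POOL_ID = "PythonQuickstartPool"
+#     POOL_NODE_COUNT = 2
+#     POOL_VM_SIZE = "STANDARD_DS1_V2"
+#     JOB_ID = "PythonQuickstartJob"
+#     STANDARD_OUT_FILE_NAME = "stdout.txt"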
+
+def query_yes_no(question: str, default: str = "yes") -> str:
+    """
+    Prompts the user for yes/no input, displaying the specified question text.
+
+    :param str question: The text of the prompt for input.
+    :param str default: The default if the user hits <ENTER>. Acceptable values
+    are 'yes', 'no', and None.
+    :return: 'yes' or 'no'
+    """
+    valid = {'y': 'yes', 'n': 'no'}
+    if default is None:
+        prompt = ' [y/n] '
+    elif default == 'yes':
+        prompt = ' [Y/n] '
+    elif default == 'no':
+        prompt = ' [y/N] '
+    else:
+        raise ValueError(f"Invalid default answer: '{default}'")
+
+    choice = default
+
+    while 1:
+        user_input = input(question + prompt).lower()
+        if not user_input:
+            break
+        try:
+            choice = valid[user_input[0]]
+            break
+        except (KeyError, IndexError):
+            print("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
+
+    return choice
+
+
+def print_batch_exception(batch_exception: batchmodels.BatchErrorException):
+    """
+    Prints the contents of the specified Batch exception.
+
+    :param batch_exception:
+    """
+    print('-------------------------------------------')
+    print('Exception encountered:')
+    if batch_exception.error and \
+            batch_exception.error.message and \
+            batch_exception.error.message.value:
+        print(batch_exception.error.message.value)
+        if batch_exception.error.values:
+            print()
+            for mesg in batch_exception.error.values:
+                print(f'{mesg.key}:\t{mesg.value}')
+    print('-------------------------------------------')
+
+
+def upload_file_to_container(blob_storage_service_client: BlobServiceClient,
+                             container_name: str, file_path: str) -> batchmodels.ResourceFile:
+    """
+    Uploads a local file to an Azure Blob storage container.
+
+    :param blob_storage_service_client: A blob service client.
+    :param str container_name: The name of the Azure Blob storage container.
+    :param str file_path: The local path to the file.
+    :return: A ResourceFile initialized with a SAS URL appropriate for Batch
+    tasks.
+    """
+    blob_name = os.path.basename(file_path)
+    blob_client = blob_storage_service_client.get_blob_client(container_name, blob_name)
+
+    print(f'Uploading file {file_path} to container [{container_name}]...')
+
+    with open(file_path, "rb") as data:
+        blob_client.upload_blob(data, overwrite=True)
+
+    sas_token = generate_blob_sas(
+        config.STORAGE_ACCOUNT_NAME,
+        container_name,
+        blob_name,
+        account_key=config.STORAGE_ACCOUNT_KEY,
+        permission=BlobSasPermissions(read=True),
+        expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=2)
+    )
+
+    sas_url = generate_sas_url(
+        config.STORAGE_ACCOUNT_NAME,
+        config.STORAGE_ACCOUNT_DOMAIN,
+        container_name,
+        blob_name,
+        sas_token
+    )
+
+    return batchmodels.ResourceFile(
+        http_url=sas_url,
+        file_path=blob_name
+    )
+
+
+def generate_sas_url(
+    account_name: str,
+    account_domain: str,
+    container_name: str,
+    blob_name: str,
+    sas_token: str
+) -> str:
+    """
+    Generates and returns a sas url for accessing blob storage
+    """
+    return f"https://{account_name}.{account_domain}/{container_name}/{blob_name}?{sas_token}"
+
+
+def create_pool(batch_service_client: BatchServiceClient, pool_id: str):
+    """
+    Creates a pool of compute nodes with the specified OS settings.
+
+    :param batch_service_client: A Batch service client.
+    :param str pool_id: An ID for the new pool.
+    """
+    print(f'Creating pool [{pool_id}]...')
+
+    # Create a new pool of Linux compute nodes using an Azure Virtual Machines
+    # Marketplace image. For more information about creating pools of Linux
+    # nodes, see:
+    # https://azure.microsoft.com/documentation/articles/batch-linux-nodes/
+    new_pool = batchmodels.PoolAddParameter(
+        id=pool_id,
+        virtual_machine_configuration=batchmodels.VirtualMachineConfiguration(
+            image_reference=batchmodels.ImageReference(
+                publisher="canonical",
+                offer="0001-com-ubuntu-server-focal",
+                sku="20_04-lts",
+                version="latest"
+            ),
+            node_agent_sku_id="batch.node.ubuntu 20.04"),
+        vm_size=config.POOL_VM_SIZE,
+        target_dedicated_nodes=config.POOL_NODE_COUNT
+    )
+    batch_service_client.pool.add(new_pool)
+
+
+def create_job(batch_service_client: BatchServiceClient, job_id: str, pool_id: str):
+    """
+    Creates a job with the specified ID, associated with the specified pool.
+
+    :param batch_service_client: A Batch service client.
+    :param str job_id: The ID for the job.
+    :param str pool_id: The ID for the pool.
+    """
+    print(f'Creating job [{job_id}]...')
+
+    job = batchmodels.JobAddParameter(
+        id=job_id,
+        pool_info=batchmodels.PoolInformation(pool_id=pool_id))
+
+    batch_service_client.job.add(job)
+
+
+def add_tasks(batch_service_client: BatchServiceClient, job_id: str, resource_input_files: list):
+    """
+    Adds a task for each input file in the collection to the specified job.
+
+    :param batch_service_client: A Batch service client.
+    :param str job_id: The ID of the job to which to add the tasks.
+    :param list resource_input_files: A collection of input files. One task will be
+     created for each input file.
+    """
+
+    print(f'Adding {len(resource_input_files)} tasks to job [{job_id}]...')
+
+    tasks = []
+
+    for idx, input_file in enumerate(resource_input_files):
+
+        command = f"/bin/bash -c \"cat {input_file.file_path}\""
+        tasks.append(batchmodels.TaskAddParameter(
+            id=f'Task{idx}',
+            command_line=command,
+            resource_files=[input_file]
+        )
+        )
+
+    batch_service_client.task.add_collection(job_id, tasks)
+
+
+def wait_for_tasks_to_complete(batch_service_client: BatchServiceClient, job_id: str,
+                               timeout: datetime.timedelta):
+    """
+    Returns when all tasks in the specified job reach the Completed state.
+
+    :param batch_service_client: A Batch service client.
+    :param job_id: The id of the job whose tasks should be monitored.
+    :param timeout: The duration to wait for task completion. If all
+    tasks in the specified job do not reach Completed state within this time
+    period, an exception will be raised.
+    """
+    timeout_expiration = datetime.datetime.now() + timeout
+
+    print(f"Monitoring all tasks for 'Completed' state, timeout in {timeout}...", end='')
+
+    while datetime.datetime.now() < timeout_expiration:
+        print('.', end='')
+        sys.stdout.flush()
+        tasks = batch_service_client.task.list(job_id)
+
+        incomplete_tasks = [task for task in tasks if
+                            task.state != batchmodels.TaskState.completed]
+        if not incomplete_tasks:
+            print()
+            return True
+
+        time.sleep(1)
+
+    print()
+    raise RuntimeError("ERROR: Tasks did not reach 'Completed' state within "
+                       "timeout period of " + str(timeout))
+
+
+def print_task_output(batch_service_client: BatchServiceClient, job_id: str,
+                      text_encoding: str=None):
+    """
+    Prints the stdout.txt file for each task in the job.
+
+    :param batch_service_client: The Batch service client to use.
+    :param str job_id: The id of the job with task output files to print.
+    :param str text_encoding: The encoding to use for the task output. Defaults to utf-8.
+    """
+
+    print('Printing task output...')
+
+    tasks = batch_service_client.task.list(job_id)
+
+    for task in tasks:
+
+        node_id = batch_service_client.task.get(
+            job_id, task.id).node_info.node_id
+        print(f"Task: {task.id}")
+        print(f"Node: {node_id}")
+
+        stream = batch_service_client.file.get_from_task(
+            job_id, task.id, config.STANDARD_OUT_FILE_NAME)
+
+        file_text = _read_stream_as_string(
+            stream,
+            text_encoding)
+
+        if text_encoding is None:
+            text_encoding = DEFAULT_ENCODING
+
+        sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding = text_encoding)
+        sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding = text_encoding)
+
+        print("Standard output:")
+        print(file_text)
+
+
+def _read_stream_as_string(stream, encoding) -> str:
+    """
+    Read stream as string
+
+    :param stream: input stream generator
+    :param str encoding: The encoding of the file. The default is utf-8.
+    :return: The file content.
+    """
+    output = io.BytesIO()
+    try:
+        for data in stream:
+            output.write(data)
+        if encoding is None:
+            encoding = DEFAULT_ENCODING
+        return output.getvalue().decode(encoding)
+    finally:
+        output.close()
+
+
+if __name__ == '__main__':
+
+    start_time = datetime.datetime.now().replace(microsecond=0)
+    print(f'Sample start: {start_time}')
+    print()
+
+    # Create the blob client, for use in obtaining references to
+    # blob storage containers and uploading files to containers.
+    blob_service_client = BlobServiceClient(
+        account_url=f"https://{config.STORAGE_ACCOUNT_NAME}.{config.STORAGE_ACCOUNT_DOMAIN}/",
+        credential=config.STORAGE_ACCOUNT_KEY
+    )
+
+    # Use the blob client to create the containers in Azure Storage if they
+    # don't yet exist.
+    input_container_name = 'input'      # pylint: disable=invalid-name
+    try:
+        blob_service_client.create_container(input_container_name)
+    except ResourceExistsError:
+        pass
+
+    # The collection of data files that are to be processed by the tasks.
+    input_file_paths = [os.path.join(sys.path[0], 'taskdata0.txt'),
+                        os.path.join(sys.path[0], 'taskdata1.txt'),
+                        os.path.join(sys.path[0], 'taskdata2.txt')]
+
+    # Upload the data files.
+    input_files = [
+        upload_file_to_container(blob_service_client, input_container_name, file_path)
+        for file_path in input_file_paths]
+
+    # Create a Batch service client. We'll now be interacting with the Batch
+    # service in addition to Storage
+    credentials = SharedKeyCredentials(config.BATCH_ACCOUNT_NAME,
+        config.BATCH_ACCOUNT_KEY)
+
+    batch_client = BatchServiceClient(
+        credentials,
+        batch_url=config.BATCH_ACCOUNT_URL)
+
+    try:
+        # Create the pool that will contain the compute nodes that will execute the
+        # tasks.
+        create_pool(batch_client, config.POOL_ID)
+
+        # Create the job that will run the tasks.
+        create_job(batch_client, config.JOB_ID, config.POOL_ID)
+
+        # Add the tasks to the job.
+        add_tasks(batch_client, config.JOB_ID, input_files)
+
+        # Pause execution until tasks reach Completed state.
+        wait_for_tasks_to_complete(batch_client,
+                                   config.JOB_ID,
+                                   datetime.timedelta(minutes=30))
+
+        print("  Success! All tasks reached the 'Completed' state within the "
+              "specified timeout period.")
+
+        # Print the stdout.txt file for each task to the console
+        print_task_output(batch_client, config.JOB_ID)
+
+        # Print out some timing info
+        end_time = datetime.datetime.now().replace(microsecond=0)
+        print()
+        print(f'Sample end: {end_time}')
+        elapsed_time = end_time - start_time
+        print(f'Elapsed time: {elapsed_time}')
+        print()
+        input('Press ENTER to exit...')
+
+    except batchmodels.BatchErrorException as err:
+        print_batch_exception(err)
+        raise
+
+    finally:
+        # Clean up storage resources
+        print(f'Deleting container [{input_container_name}]...')
+        blob_service_client.delete_container(input_container_name)
+
+        # Clean up Batch resources (if the user so chooses).
+        if query_yes_no('Delete job?') == 'yes':
+            batch_client.job.delete(config.JOB_ID)
+
+        if query_yes_no('Delete pool?') == 'yes':
+            batch_client.pool.delete(config.POOL_ID)
+ 

+ 218 - 0
data/purposeCombined/Azure/AddUp/table_advanced_samples.py

@@ -0,0 +1,218 @@
+#-------------------------------------------------------------------------
+# Microsoft Developer & Platform Evangelism
+#
+# Copyright (c) Microsoft Corporation. All rights reserved.
+#
+# THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, 
+# EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES 
+# OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
+#----------------------------------------------------------------------------------
+# The example companies, organizations, products, domain names,
+# e-mail addresses, logos, people, places, and events depicted
+# herein are fictitious. No association with any real company,
+# organization, product, domain name, email address, logo, person,
+# places, or events is intended or should be inferred.
+#--------------------------------------------------------------------------
+import config
+import datetime
+import time
+from random_data import RandomData
+from tablestorageaccount import TableStorageAccount
+from azure.storage import CloudStorageAccount, AccessPolicy
+from azure.storage.table import TableService, Entity, TablePermissions
+from azure.storage.models import CorsRule, Logging, Metrics, RetentionPolicy, ResourceTypes, AccountPermissions
+
+#
+# Azure Table Service Sample - Demonstrate how to perform common tasks using the Microsoft Azure Table Service
+# including creating a table, CRUD operations and different querying techniques.
+#
+# Documentation References:
+#  - What is a Storage Account - http://azure.microsoft.com/en-us/documentation/articles/storage-whatis-account/
+#  - Getting Started with Tables - https://azure.microsoft.com/en-us/documentation/articles/storage-python-how-to-use-table-storage/
+#  - Table Service Concepts - http://msdn.microsoft.com/en-us/library/dd179463.aspx
+#  - Table Service REST API - http://msdn.microsoft.com/en-us/library/dd179423.aspx
+#  - Table Service Python API - http://azure.github.io/azure-storage-python/ref/azure.storage.table.html
+#  - Storage Emulator - http://azure.microsoft.com/en-us/documentation/articles/storage-use-emulator/
+#
+class TableAdvancedSamples():
+
+    def __init__(self):
+        self.random_data = RandomData()
+
+    # Runs all samples for Azure Storage Table service.
+    def run_all_samples(self, account):
+        table_service = account.create_table_service()
+        print('Azure Storage Advanced Table samples - Starting.')
+        
+        print('\n\n* List tables *\n')
+        self.list_tables(table_service)
+        
+        if not account.is_azure_cosmosdb_table():
+            print('\n\n* Set service properties *\n')
+            self.set_service_properties(table_service)
+
+            print('\n\n* Set Cors rules *\n')
+            self.set_cors_rules(table_service)
+
+            print('\n\n* ACL operations *\n')
+            self.table_acl_operations(table_service)
+        
+        if (config.IS_EMULATED):
+            print('\n\n* Shared Access Signature is not supported in emulator *\n')
+        else:
+            print('\n\n* SAS operations *\n')
+            self.table_operations_with_sas(account)
+
+        print('\nAzure Storage Advanced Table samples - Completed.\n')
+
+    # Manage tables including creating, listing and deleting
+    def list_tables(self, table_service):
+        table_prefix = 'table' + self.random_data.get_random_name(6)
+
+        try:        
+            # Create tables
+            for i in range(5):
+                table_name = table_prefix + str(i)
+                print('1. Create a table with name - ' + table_name)
+                table_service.create_table(table_name)
+            
+            # List all the tables 
+            print('2. List tables')
+            tables = table_service.list_tables()
+            for table in tables:
+                print('\tTable Name: ' + table.name)
+
+        finally:
+            # Delete the tables
+            print("3. Delete Tables")
+            for i in range(5):
+                table_name = table_prefix + str(i)
+                if(table_service.exists(table_name)):
+                    table_service.delete_table(table_name)
+            
+        print("List tables sample completed")
+    
+    # Manage properties of the Table service, including logging and metrics settings, and the default service version.
+    def set_service_properties(self, table_service):
+        print('1. Get Table service properties')
+        props = table_service.get_table_service_properties()
+
+        retention = RetentionPolicy(enabled=True, days=5)
+        logging = Logging(delete=True, read=False, write=True, retention_policy=retention)
+        hour_metrics = Metrics(enabled=True, include_apis=True, retention_policy=retention)
+        minute_metrics = Metrics(enabled=False)
+
+        try:
+            print('2. Overwrite Table service properties')
+            table_service.set_table_service_properties(logging=logging, hour_metrics=hour_metrics, minute_metrics=minute_metrics)
+
+        finally:
+            print('3. Revert Table service properties back to the original ones')
+            table_service.set_table_service_properties(logging=props.logging, hour_metrics=props.hour_metrics, minute_metrics=props.minute_metrics)
+
+        print('4. Set Table service properties completed')
+    
+    # Manage CORS rules on the table service
+    def set_cors_rules(self, table_service):
+        cors_rule = CorsRule(
+            allowed_origins=['*'], 
+            allowed_methods=['POST', 'GET'],
+            allowed_headers=['*'],
+            exposed_headers=['*'],
+            max_age_in_seconds=3600)
+        
+        print('1. Get Cors Rules')
+        original_cors_rules = table_service.get_table_service_properties().cors
+
+        try:        
+            print('2. Overwrite Cors Rules')
+            table_service.set_table_service_properties(cors=[cors_rule])
+
+        finally:
+            #reverting cors rules back to the original ones
+            print('3. Revert Cors Rules back to the original ones')
+            table_service.set_table_service_properties(cors=original_cors_rules)
+        
+        print("CORS sample completed")
+
+    # Manage table access policy
+    def table_acl_operations(self, table_service):
+        table_name = 'acltable' + self.random_data.get_random_name(6)
+
+        try:        
+            print('1. Create a table with name - ' + table_name)
+            table_service.create_table(table_name)
+                
+            print('2. Set access policy for table')
+            access_policy = AccessPolicy(permission=TablePermissions.QUERY,
+                                        expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=1))
+            identifiers = {'id': access_policy}
+            table_service.set_table_acl(table_name, identifiers)
+
+            print('3. Wait 30 seconds for acl to propagate')
+            time.sleep(30)
+
+            print('4. Get access policy from table')
+            acl = table_service.get_table_acl(table_name)
+
+            print('5. Clear access policy in table')
+            table_service.set_table_acl(table_name)
+
+        finally:
+            print('6. Delete table')
+            if(table_service.exists(table_name)):
+                table_service.delete_table(table_name)
+            
+        print("Table ACL operations sample completed")
+    
+    # Manage shared access signature on a table
+    def table_operations_with_sas(self, account):
+        table_name = 'sastable' + self.random_data.get_random_name(6)
+        
+        try:
+            # Create a Table Service object
+            table_service = account.create_table_service()
+            
+            print('1. Create table with name - ' + table_name)
+            table_service.create_table(table_name)
+            
+            # Create a Shared Access Signature for the table
+            print('2. Get sas for table')
+            
+            table_sas = table_service.generate_table_shared_access_signature(
+                table_name, 
+                TablePermissions.QUERY + TablePermissions.ADD + TablePermissions.UPDATE + TablePermissions.DELETE, 
+                datetime.datetime.utcnow() + datetime.timedelta(hours=1))
+
+            shared_account = TableStorageAccount(account_name=account.account_name, sas_token=table_sas, endpoint_suffix=account.endpoint_suffix)
+            shared_table_service = shared_account.create_table_service()
+
+            # Create a sample entity to insert into the table
+            customer = {'PartitionKey': 'Harp', 'RowKey': '1', 'email' : 'harp@contoso.com', 'phone' : '555-555-5555'}
+
+            # Insert the entity into the table
+            print('3. Insert new entity into table with sas - ' + table_name)
+            shared_table_service.insert_entity(table_name, customer)
+            
+            # Demonstrate how to query the entity
+            print('4. Read the inserted entity with sas.')
+            entity = shared_table_service.get_entity(table_name, 'Harp', '1')
+            
+            print(entity['email'])
+            print(entity['phone'])
+
+            # Demonstrate how to update the entity by changing the phone number
+            print('5. Update an existing entity by changing the phone number with sas')
+            customer = {'PartitionKey': 'Harp', 'RowKey': '1', 'email' : 'harp@contoso.com', 'phone' : '425-123-1234'}
+            shared_table_service.update_entity(table_name, customer)
+
+            # Demonstrate how to delete an entity
+            print('6. Delete the entity with sas')
+            shared_table_service.delete_entity(table_name, 'Harp', '1')
+
+        finally:
+            print('7. Delete table')
+            if(table_service.exists(table_name)):
+                table_service.delete_table(table_name)
+            
+        print("Table operations with sas completed")

+ 96 - 0
data/purposeCombined/Azure/AddUp/table_basic_samples.py

@@ -0,0 +1,96 @@
+#-------------------------------------------------------------------------
+# Microsoft Developer & Platform Evangelism
+#
+# Copyright (c) Microsoft Corporation. All rights reserved.
+#
+# THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, 
+# EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES 
+# OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
+#----------------------------------------------------------------------------------
+# The example companies, organizations, products, domain names,
+# e-mail addresses, logos, people, places, and events depicted
+# herein are fictitious. No association with any real company,
+# organization, product, domain name, email address, logo, person,
+# places, or events is intended or should be inferred.
+#--------------------------------------------------------------------------
+import config
+from random_data import RandomData
+from azure.storage import CloudStorageAccount
+from azure.storage.table import TableService, Entity
+
+#
+# Azure Table Service Sample - Demonstrate how to perform common tasks using the Microsoft Azure Table Service
+# including creating a table, CRUD operations and different querying techniques.
+#
+# Documentation References:
+#  - What is a Storage Account - http://azure.microsoft.com/en-us/documentation/articles/storage-whatis-account/
+#  - Getting Started with Tables - https://azure.microsoft.com/en-us/documentation/articles/storage-python-how-to-use-table-storage/
+#  - Table Service Concepts - http://msdn.microsoft.com/en-us/library/dd179463.aspx
+#  - Table Service REST API - http://msdn.microsoft.com/en-us/library/dd179423.aspx
+#  - Table Service Python API - http://azure.github.io/azure-storage-ruby/
+#  - Storage Emulator - http://azure.microsoft.com/en-us/documentation/articles/storage-use-emulator/
+#
+class TableBasicSamples():
+
+    def __init__(self):
+        self.random_data = RandomData()
+
+    # Runs all samples for Azure Storage Table service.
+    def run_all_samples(self, account):
+        print('Azure Storage Basic Table samples - Starting.')
+        table_name = 'tablebasics' + self.random_data.get_random_name(6)
+        table_service = None
+        try:
+            table_service = account.create_table_service()
+
+            # Create a new table
+            print('Create a table with name - ' + table_name)
+
+            try:
+                table_service.create_table(table_name)
+            except Exception as err:
+                print('Error creating table, ' + table_name + '. Check if it already exists')
+ 
+            # Create a sample entity to insert into the table
+            customer = {'PartitionKey': 'Harp', 'RowKey': '1', 'email' : 'harp@contoso.com', 'phone' : '555-555-5555'}
+
+            # Insert the entity into the table
+            print('Inserting a new entity into table - ' + table_name)
+            table_service.insert_entity(table_name, customer)
+            print('Successfully inserted the new entity')
+
+            # Demonstrate how to query the entity
+            print('Read the inserted entity.')
+            entity = table_service.get_entity(table_name, 'Harp', '1')
+            print(entity['email'])
+            print(entity['phone'])
+
+            # Demonstrate how to update the entity by changing the phone number
+            print('Update an existing entity by changing the phone number')
+            customer = {'PartitionKey': 'Harp', 'RowKey': '1', 'email' : 'harp@contoso.com', 'phone' : '425-123-1234'}
+            table_service.update_entity(table_name, customer)
+
+            # Demonstrate how to query the updated entity, filter the results with a filter query and select only the value in the phone column
+            print('Read the updated entity with a filter query')
+            entities = table_service.query_entities(table_name, filter="PartitionKey eq 'Harp'", select='phone')
+            for entity in entities:
+                print(entity['phone'])
+
+            # Demonstrate how to delete an entity
+            print('Delete the entity')
+            table_service.delete_entity(table_name, 'Harp', '1')
+            print('Successfully deleted the entity')
+
+        except Exception as e:
+            if (config.IS_EMULATED):
+                print('Error occurred in the sample. If you are using the emulator, please make sure the emulator is running.', e)
+            else: 
+                print('Error occurred in the sample. Please make sure the account name and key are correct.', e)
+        finally:
+            # Demonstrate deleting the table, if you don't want to have the table deleted comment the below block of code
+            print('Deleting the table.')
+            if(table_service.exists(table_name)):
+                table_service.delete_table(table_name)
+            print('Successfully deleted the table')
+
+        print('\nAzure Storage Basic Table samples - Completed.\n')

+ 1 - 0
data/purposeCombined/Azure/AzureStorage

@@ -0,0 +1 @@
+Subproject commit ac4dbd83e307a5b8d3fd3b77103ec837b821c564

+ 125 - 0
data/purposeCombined/Azure/DLfile.py

@@ -0,0 +1,125 @@
+from azure.datalake.store import core, lib
+import config
+
+import sys, io
+import schedule, threading, time
+
+from datetime import datetime
+
+from os import listdir
+from os.path import isfile, join
+
+
+import glob
+
+
+def run_once_threaded(job_func):
+    job_thread = threading.Thread(target=job_func)
+    job_thread.start()
+    return schedule.CancelJob
+
+def run_threaded(job_func):
+    job_thread = threading.Thread(target=job_func)
+    job_thread.start()
+    
+
+local_upload_folder_path = "LOCAL_FOLDER_PATH"
+adls_upload_folder_path = "ADLS_FOLDER_PATH"
+
+
+orginal_stdout = sys.stdout
+
+buf = io.StringIO()
+sys.stdout = buf
+adlCreds = -1
+
+uploaded_files = False
+
+def postToTeams():
+ output = buf.getvalue()
+ if output == "":
+  return
+ orginal_stdout.write(output)
+
+  
+ now = datetime.now()
+ current_time = now.strftime("%H:%M:%S")
+ 
+ config.sendToTeams("{}<br>{}".format(current_time, output))
+ 
+ buf.truncate(0)
+ buf.seek(0)
+ 
+def authenticate():
+ global adlCreds
+ adlCreds = lib.auth(config.azure_tenant_id)
+
+
+def authenticated():
+ if adlCreds ==  -1:
+  return
+  
+#  print("Authentication success!")
+  
+ run_once_threaded(upload_files)
+ 
+ return schedule.CancelJob
+
+ 
+def upload_files():
+ adl = core.AzureDLFileSystem(adlCreds, store_name=config.store_name)
+ uploadedFolders = adl.ls(adls_upload_folder_path)
+ 
+ uploadedFolders = set([folder.replace(adls_upload_folder_path[1:], "")+"/" for folder in uploadedFolders])
+ 
+ local_folders = glob.glob(local_upload_folder_path+"*") # * means all if need specific format then *.csv
+ local_folders = set([d.replace(local_upload_folder_path, "")+"/" for d in local_folders])
+
+ to_upload_folders = local_folders.difference(uploadedFolders)
+
+ folder_names = sorted([d.replace(local_upload_folder_path, "") for d in to_upload_folders])
+
+ files = []
+ for folder in folder_names:
+  path = local_upload_folder_path+folder
+  for f in listdir(path):
+   if isfile(join(path, f)):
+    files.append(folder+f)
+
+
+ print("Uploading the following folders:<br>{}<br>Total number of files to upload:<br>{}".format(", ". join(folder_names), len(files)))
+ 
+
+ for f in files:
+  adl.put(local_upload_folder_path+f, adls_upload_folder_path+f)
+    
+
+ print("Upload finished.")
+ time.sleep(2)
+ global uploaded_files
+ uploaded_files = True
+
+
+def exit_program():
+ if uploaded_files == True:
+  exit()
+
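+# Scheduler wiring: postToTeams flushes any captured stdout to Teams every two seconds,
+# authenticate runs once in a background thread to obtain the ADLS credentials,
+# authenticated waits until those credentials exist and then starts the upload once,
+# and exit_program stops the script after the upload has completed.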
+schedule.every(2).seconds.do(run_threaded, postToTeams)
+schedule.every().seconds.do(run_once_threaded, authenticate)
+schedule.every().seconds.do(authenticated)
+schedule.every().seconds.do(exit_program)
+
+
+while 1:
+    schedule.run_pending()
+    time.sleep(1) 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 

+ 1 - 0
data/purposeCombined/Azure/azure-multiapi-storage-python

@@ -0,0 +1 @@
+Subproject commit dc0e7dc1066ca4cd2d6006a5bccd7ec37521ec1c

+ 64 - 0
data/purposeCombined/Azure/blob-adapter.py

@@ -0,0 +1,64 @@
+import configparser
+from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__
+from azure.core.exceptions import HttpResponseError, ResourceExistsError
+from flask import jsonify
+
+class AzureBlobAdapter:
+    FILE_PREFIX = 'IN_CARE'
+    blob_service_client: BlobServiceClient
+    blob_client: BlobClient
+    container_client: ContainerClient
+    configs = configparser.ConfigParser()
+    configs.read('azure_blob.cfg')
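+    # azure_blob.cfg is assumed to contain an [azure_blob_config] section with the keys
+    # read via get_config below; a minimal sketch with placeholder values:
+    #
+    #     [azure_blob_config]
+    #     connection_string = DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>;EndpointSuffix=core.windows.net
+    #     container_name = mycontainer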
+
+    # init method or constructor
+
+    def __init__(self):
+        connection_string = self.get_config('connection_string')
+        print("Azure Blob Storage v" + __version__ +
+              " - Blob Python libs")
+        self.blob_service_client = BlobServiceClient.from_connection_string(
+            connection_string)
+
+    def upload(self, file_dict):
+        upload_response = {}
+        for key in file_dict:
+            print("File Dict Key: [{}] value is: {}".format(key, file_dict[key]))
+            print("\nUploading to Azure Storage as blob:\n\t" + key)
+
+            self.blob_client = self.blob_service_client.get_blob_client(container=self.get_config('container_name'), blob=key)
+            with open(file_dict[key], "rb") as data:
+                try:
+                    self.blob_client.upload_blob(data)
+                    print('File: Uploaded Successfully: {}'.format(key))
+                    upload_response[key] = 'Successfully Uploaded'
+                except ResourceExistsError:
+                    print('File: NOT Uploaded Successfully: {}'.format(key))
+                    upload_response[key] = 'This Resource already exists'
+                    upload_response['Partial'] = True
+                    print('This Resource already exists')
+                    # return 'This Resource already exists'
+        print("Before Returning Response:")
+        print(jsonify(upload_response))
+        print("---------------")
+        return upload_response
+
+    def get_blob_client(self, blob_name):
+        self.blob_client = self.blob_service_client.get_blob_client(
+            container=self.get_config('container_name'), blob=blob_name)
+        return self.blob_client
+
+    def list_blobs(self):
+        print("\nList blobs in the container")
+        self.container_client = self.blob_service_client.get_container_client(
+            container=self.get_config('container_name'))
+        blob_list = self.container_client.list_blobs()
+        blobs = []
+        for blob in blob_list:
+            # print("\t Blob name: " + blob.name)
+            blobs.append(blob.name)
+        return blobs
+
+    def get_config(self, app_property):
+        config_value = self.configs['azure_blob_config'][app_property]
+        return config_value

+ 98 - 0
data/purposeCombined/Azure/blob-permission.py

@@ -0,0 +1,98 @@
+from datetime import datetime, timedelta
+
+from azure.storage.blob import BlobSasPermissions, generate_blob_sas
+
+from azurebatchload.checks import Checks
+
+
+class Base(Checks):
+    def __init__(
+        self,
+        destination,
+        folder,
+        extension=None,
+        modified_since=None,
+        method="batch",
+        list_files=None,
+        expiry_download_links=7,
+    ):
+        super().__init__(directory=folder)
+
+        self.destination = destination
+        self.folder = folder
+        self.extension = extension
+        self.modified_since = modified_since
+        if not self._check_azure_cli_installed():
+            self.method = "single"
+        else:
+            self.method = method
+        self.list_files = list_files
+        credentials = self._check_connection_credentials()
+        self.connection_string = credentials[0]
+        self.account_name = credentials[1]
+        self.account_key = credentials[2]
+        self.expiry_download_links = expiry_download_links
+
+    def checks(self):
+        allowed_methods = ("batch", "single")
+        if self.method not in allowed_methods:
+            raise ValueError(f"Method {self.method} is not a valid method. Choose from {' or '.join(allowed_methods)}.")
+
+        if self.list_files and self.method == "batch":
+            raise ValueError("list_files is only allowed with method='single'.")
+
+        if self.list_files and not isinstance(self.list_files, list):
+            raise ValueError(f"Argument list_files was set, but is not of type list, but type {type(self.list_files)}")
+
+    def create_blob_link(self, blob_folder, blob_name) -> str:
+        if blob_folder:
+            full_path_blob = f"{blob_folder}/{blob_name}"
+        else:
+            full_path_blob = blob_name
+        url = f"https://{self.account_name}.blob.core.windows.net/{self.destination}/{full_path_blob}"
+        sas_token = generate_blob_sas(
+            account_name=self.account_name,
+            account_key=self.account_key,
+            container_name=self.destination,
+            blob_name=full_path_blob,
+            permission=BlobSasPermissions(read=True, delete_previous_version=False),
+            expiry=datetime.utcnow() + timedelta(days=self.expiry_download_links),
+        )
+
+        url_with_sas = f"{url}?{sas_token}"
+        return url_with_sas
+
+    @staticmethod
+    def create_not_case_sensitive_extension(extension):
+        """
+        Create a case-insensitive fnmatch pattern, for example:
+        .pdf -> .[Pp][Dd][Ff]
+        .csv -> .[Cc][Ss][Vv]
+        """
+        new_extension = ""
+        for letter in extension:
+            if not letter.isalpha():
+                new_extension += letter
+            else:
+                new_extension += f"[{letter.upper()}{letter}]"
+
+        if not new_extension.startswith("*"):
+            new_extension = "*" + new_extension
+
+        return new_extension
+
+    def define_pattern(self):
+        # Only normalize the extension when one was provided (it may be None)
+        if self.extension:
+            self.extension = self.create_not_case_sensitive_extension(self.extension)
+        if self.folder and not self.extension:
+            if self.folder.endswith("/"):
+                pattern = self.folder + "*"
+            else:
+                pattern = self.folder + "/*"
+        elif self.folder and self.extension:
+            pattern = self.folder.rstrip("/") + "/" + "*" + self.extension
+        elif not self.folder and self.extension:
+            pattern = "*" + self.extension
+        else:
+            pattern = None
+
+        return pattern

+ 101 - 0
data/purposeCombined/Azure/blob-upload-1.py

@@ -0,0 +1,101 @@
+import logging
+import os
+
+from azure.storage.blob import BlobServiceClient
+
+from azurebatchload.core import Base
+
+
+class Upload(Base):
+    def __init__(
+        self,
+        destination,
+        source,
+        folder=None,
+        extension=None,
+        method="batch",
+        modified_since=None,
+        overwrite=False,
+        list_files=None,
+        create_download_links=False,
+        expiry_download_links=7,
+    ):
+        super(Upload, self).__init__(
+            destination=destination,
+            folder=source,
+            extension=extension,
+            modified_since=modified_since,
+            method=method,
+            list_files=list_files,
+            expiry_download_links=expiry_download_links,
+        )
+        self.blob_folder = folder
+        self.overwrite = overwrite
+        self.create_download_links = create_download_links
+
+    def upload_batch(self):
+        cmd = f"az storage fs directory upload " f"-f {self.destination} " f"-s {self.folder} -r"
+
+        non_default = {"-d": self.blob_folder, "--connection-string": self.connection_string}
+
+        for flag, value in non_default.items():
+            if value:
+                cmd = f"{cmd} {flag} '{value}'"
+
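+        # A fully assembled command looks roughly like the following (placeholder
+        # container, folder, and connection string):
+        #   az storage fs directory upload -f mycontainer -s ./local/folder -r -d uploads --connection-string '<connection-string>'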
+        os.system(cmd)
+
+    def upload_single(self):
+        blob_service_client = BlobServiceClient.from_connection_string(self.connection_string)
+        download_links = {}
+
+        for root, dirs, files in os.walk(self.folder):
+            for file in files:
+
+                full_path = os.path.join(root, file)
+
+                # ignore hidden files
+                if file.startswith("."):
+                    continue
+
+                # if list_files is given, only upload matched files
+                if self.list_files and file not in self.list_files:
+                    continue
+
+                # if extension is given only upload if extension is matched
+                if self.extension and os.path.isfile(full_path) and not file.lower().endswith(self.extension.lower()):
+                    continue
+
+                blob_folder = root.replace(self.folder, "").lstrip("/")
+
+                if self.blob_folder:
+                    # we only want to append blob_folder if it actually is a path or folder
+                    # blob_folder can be empty string ""
+                    if blob_folder:
+                        blob_folder = os.path.join(self.blob_folder, blob_folder)
+                    else:
+                        blob_folder = self.blob_folder
+
+                # if no folder is given, just upload to the container root path
+                if not blob_folder:
+                    container = self.destination
+                else:
+                    container = os.path.join(self.destination, blob_folder)
+                container_client = blob_service_client.get_container_client(container=container)
+
+                with open(full_path, "rb") as data:
+                    logging.debug(f"Uploading blob {full_path}")
+                    container_client.upload_blob(data=data, name=file, overwrite=self.overwrite)
+
+                if self.create_download_links:
+                    download_links[file] = self.create_blob_link(blob_folder=blob_folder, blob_name=file)
+
+        return download_links
+
+    def upload(self):
+        self.checks()
+
+        logging.info(f"Uploading to container {self.destination} with method = '{self.method}'.")
+        if self.method == "batch":
+            return self.upload_batch()
+        else:
+            return self.upload_single()
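+
+    # Minimal usage sketch (hypothetical container and paths):
+    #   Upload(destination="mycontainer", source="./data", folder="uploads", method="single").upload()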

+ 81 - 0
data/purposeCombined/Azure/blob-upload-2.py

@@ -0,0 +1,81 @@
+import requests
+from bs4 import BeautifulSoup as bs
+import os
+from azure.storage.blob import BlobServiceClient, BlobClient
+from azure.storage.blob import ContentSettings, ContainerClient
+
+#Your Connection String
+MY_CONNECTION_STRING = "DefaultEndpointsProtocol************************"
+#Your Container Name
+MY_IMAGE_CONTAINER = "picture"
+#Your local path
+LOCAL_IMAGE_PATH = "..\Picture"
+#change the url to the one you want to scrape
+URL = 'WebSiteURL'
+
+class AzureBlobStorage:
+    def Scrapp(self):
+        #create folder with the picture if it doesn't exist
+        if not os.path.exists('.\Picture'):
+            os.mkdir('.\Picture')
+        os.chdir('.\Picture')
+        #Change the number to begin where you want to start
+        page_begin = 1
+        #Change the number to the number of pages you want to scrape
+        page_end = 230 + 1
+
+        #If you want to scrape only one page, change the page_end to page_begin or delete the loop
+        for page in range(page_begin, page_end):
+            req = requests.get(URL + str(page))
+            soup = bs(req.text, 'html.parser')
+            images = soup.find_all('img')
+            for image in images:
+                name = image['src']
+                alpha = image['src']
+                link = 'WebSiteURL' + alpha
+                print(link)
+                #replace the name of the photo it's better :))
+                with open(name.replace(' ', '-').replace('/', '').replace('"', "'").replace('.jpg','') + '.jpg','wb') as f:
+                    im = requests.get(link)
+                    f.write(im.content)
+                    #check the name on the terminal
+                    print('Writing: ', name)
+
+    def __init__(self):
+        # Initialize the connection to Azure storage account
+        self.blob_service_client = BlobServiceClient.from_connection_string(MY_CONNECTION_STRING)
+
+    def upload_all_images_in_folder(self):
+        # Get all files with jpg extension and exclude directories
+        all_file_names = [f for f in os.listdir(LOCAL_IMAGE_PATH)
+                          if os.path.isfile(os.path.join(LOCAL_IMAGE_PATH, f)) and ".jpg" in f]
+        # Upload each file
+        for file_name in all_file_names:
+            self.upload_image(file_name)
+
+    def upload_image(self, file_name):
+        # Create blob with same name as local file name
+        blob_client = self.blob_service_client.get_blob_client(container=MY_IMAGE_CONTAINER,
+                                                               blob=file_name)
+        # Get full path to the file
+        upload_file_path = os.path.join(LOCAL_IMAGE_PATH, file_name)
+        # Create blob on storage
+        # Overwrite if it already exists!
+        image_content_setting = ContentSettings(content_type='image/jpeg')
+        print(f"uploading file - {file_name}")
+        with open(upload_file_path, "rb") as data:
+            blob_client.upload_blob(data, overwrite=True, content_settings=image_content_setting)
+
+if __name__=='__main__':
+        
+    # Initialize class and upload files
+    azure_blob_file_uploader = AzureBlobStorage()
+    azure_blob_file_uploader.Scrapp()
+    azure_blob_file_uploader.upload_all_images_in_folder()

+ 57 - 0
data/purposeCombined/Azure/blob-upload-3.py

@@ -0,0 +1,57 @@
+from flask import Flask
+from flask import jsonify
+from flask import request
+from werkzeug.utils import secure_filename
+from azure.storage.blob import BlockBlobService
+import os
+
+
+app = Flask(__name__, static_folder='static', static_url_path='')
+
+app.config['ALLOWED_EXTENSIONS'] = set(['txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'])
+app.config['MAX_CONTENT_LENGTH'] = 1 * 1024 * 1024    # 1 Mb limit
+app.config['AZURE_STORAGE_ACCOUNT'] = "flasktest"
+app.config['AZURE_STORAGE_CONTAINER'] = "doc"
+app.config['AZURE_STORAGE_KEY'] = os.environ['AZURE_STORAGE_KEY']
+try:
+    os.environ['FLASK_DEBUG']
+    app.debug = True
+except KeyError:
+    app.debug = False
+
+
+def allowed_file(filename):
+    return '.' in filename and \
+           filename.rsplit('.', 1)[1] in app.config['ALLOWED_EXTENSIONS']
+
+@app.route('/')
+def root():
+    return app.send_static_file('index.html')
+
+
+# basedir = os.path.abspath(os.path.dirname(__file__))
+
+@app.route('/uploadajax', methods=['POST'])
+def upldfile():
+    if request.method == 'POST':
+        file = request.files['file']
+        if file and allowed_file(file.filename):
+            filename = secure_filename(file.filename)
+            app.logger.info('FileName: ' + filename)
+            
+            block_blob_service = BlockBlobService(account_name=app.config['AZURE_STORAGE_ACCOUNT'], account_key=app.config['AZURE_STORAGE_KEY'])
+            block_blob_service.create_blob_from_bytes(
+                'doc',
+                filename,
+                file.read())
+            
+#             updir = os.path.join(basedir, 'upload/')
+#             file.save(os.path.join(updir, filename))
+#             file_size = os.path.getsize(os.path.join(updir, filename))
+            return jsonify(name=filename, url='https://'+app.config['AZURE_STORAGE_ACCOUNT']+'.blob.core.windows.net/' \
+                           +app.config['AZURE_STORAGE_CONTAINER']+'/'+filename)
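+
+# Example request against the endpoint above (hypothetical host, port, and file name):
+#   curl -F "file=@document.pdf" http://localhost:5000/uploadajax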
+
+
+
+if __name__ == '__main__':
+    app.run()

+ 67 - 0
data/purposeCombined/Azure/blob-upload-4.py

@@ -0,0 +1,67 @@
+import os, uuid
+from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__
+import argparse
+from argparse import ArgumentParser
+
+parser = ArgumentParser()
+parser.add_argument('--connect_str', default='', type=str)
+parser.add_argument('--container_name', default='', type=str)
+parser.add_argument('--source', default='', type=str)
+parser.add_argument('--target', default='', type=str)
+parser.add_argument('--is_directory', default=False, action='store_true')
+parser.add_argument('--download', default=False, action='store_true')
+parser.add_argument('--upload', default=False, action='store_true')
+arg = parser.parse_args()
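+
+# Example invocations (hypothetical values; pass your own connection string and container):
+#   python blob-upload-4.py --connect_str "<connection-string>" --container_name mycontainer --upload --source ./local/file.txt --target remote/file.txt
+#   python blob-upload-4.py --connect_str "<connection-string>" --container_name mycontainer --download --is_directory --source remote/dir --target ./local/dir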
+
+connect_str = arg.connect_str #Enter your connection string here! Refer to https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python?tabs=environment-variable-windows for more info
+container_name = arg.container_name #Enter your continaer name from azure blob storage here!
+blob_service_client = BlobServiceClient.from_connection_string(connect_str) # Create the BlobServiceClient object which will be used to create a container client
+
+def upload_file_to_blob(upload_file_path, target): #file path - >file path
+    blob_client = blob_service_client.get_blob_client(container=container_name, blob=target)
+    print("\nUploading to Azure Storage as blob:\n\t" + upload_file_path)
+    with open(upload_file_path, "rb") as data:
+        blob_client.upload_blob(data)
+
+def upload_directory_to_blob(upload_file_path, target): #directory name -> directory name
+    print("\nUploading directory to Azure Storage as blob:\n\t" + upload_file_path)
+    files = os.listdir(upload_file_path)
+    for entry in files:  # note: only top-level files are uploaded, nested sub-directories are not walked
+        file_name = upload_file_path + '/' + entry
+        target_ = target + '/' + entry
+        blob_client = blob_service_client.get_blob_client(container=container_name, blob=target_)
+        with open(file_name, "rb") as data:
+            blob_client.upload_blob(data)
+
+def download_file_from_blob(source, download_file_path):
+    blob_client = blob_service_client.get_blob_client(container=container_name, blob=source)
+    print("\nDownloading blob from container to:\n\t" + download_file_path)
+
+    with open(download_file_path, "wb") as download_file:
+        download_file.write(blob_client.download_blob().readall())
+
+def download_directory_from_blob(source, download_directory_path):
+    container_client = ContainerClient.from_connection_string(conn_str=connect_str, container_name=container_name)
+    print(f"\nDownloading all blobs from the following directory {source} in container {container_name}")
+    blob_list = container_client.list_blobs()
+    for blob in blob_list:
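+        # Substring match: any blob whose name contains `source` is downloaded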
+        if source in blob.name:
+            blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob.name)
+            local_path = os.path.join(download_directory_path, blob.name)
+            os.makedirs(os.path.dirname(local_path) or '.', exist_ok=True)
+            with open(local_path, "wb") as download_file:
+                download_file.write(blob_client.download_blob().readall())
+
+
+if not arg.download and not arg.upload:
+    raise Exception('Specify exactly one of --upload or --download.')
+
+if arg.download: #downloading from source to target
+    if not arg.is_directory:
+        download_file_from_blob(arg.source, arg.target)
+    else:
+        download_directory_from_blob(arg.source, arg.target)
+else: #Uploading source to target
+    if not arg.is_directory:
+        upload_file_to_blob(arg.source, arg.target)
+    else:
+        upload_directory_to_blob(arg.source, arg.target)

+ 107 - 0
data/purposeCombined/Azure/blob-upload.py

@@ -0,0 +1,107 @@
+# ----------------------------------------------------------------------------------
+# MIT License
+#
+# Copyright(c) Microsoft Corporation. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# ----------------------------------------------------------------------------------
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+
+import os
+import uuid
+import sys
+from azure.storage.blob import BlockBlobService, PublicAccess
+
+# ---------------------------------------------------------------------------------------------------------
+# Method that creates a test file in the 'Sample' folder.
+# This sample application creates a test file, uploads the test file to the Blob storage,
+# lists the blobs in the container, and downloads the file with a new name.
+# ---------------------------------------------------------------------------------------------------------
+# Documentation References:
+# Associated Article - https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python
+# What is a Storage Account - http://azure.microsoft.com/en-us/documentation/articles/storage-whatis-account/
+# Getting Started with Blobs-https://docs.microsoft.com/en-us/azure/storage/blobs/storage-python-how-to-use-blob-storage
+# Blob Service Concepts - http://msdn.microsoft.com/en-us/library/dd179376.aspx
+# Blob Service REST API - http://msdn.microsoft.com/en-us/library/dd135733.aspx
+# ----------------------------------------------------------------------------------------------------------
+
+
+def run_sample():
+    try:
+        # Create the BlockBlobService that is used to call the Blob service for the storage account
+        blob_service_client = BlockBlobService(
+            account_name='accountname', account_key='accountkey')
+
+        # Create a container called 'quickstartblobs'.
+        container_name = 'quickstartblobs'
+        blob_service_client.create_container(container_name)
+
+        # Set the permission so the blobs are public.
+        blob_service_client.set_container_acl(
+            container_name, public_access=PublicAccess.Container)
+
+        # Create the Sample folder if it does not exist, and create a file in it to test the upload and download.
+        local_path = os.path.expanduser("~/Sample")
+        if not os.path.exists(local_path):
+            os.makedirs(local_path)
+        local_file_name = "QuickStart_" + str(uuid.uuid4()) + ".txt"
+        full_path_to_file = os.path.join(local_path, local_file_name)
+
+        # Write text to the file.
+        with open(full_path_to_file, 'w') as file:
+            file.write("Hello, World!")
+
+        print("Temp file = " + full_path_to_file)
+        print("\nUploading to Blob storage as blob: " + local_file_name)
+
+        # Upload the created file, use local_file_name for the blob name
+        blob_service_client.create_blob_from_path(
+            container_name, local_file_name, full_path_to_file)
+
+        # List the blobs in the container
+        print("\nList blobs in the container")
+        generator = blob_service_client.list_blobs(container_name)
+        for blob in generator:
+            print("\t Blob name: " + blob.name)
+
+        # Download the blob(s).
+        # Add a '_DOWNLOADED' suffix to the file name so you can see both files in the Sample folder.
+        full_path_to_file2 = os.path.join(local_path, str.replace(
+            local_file_name, '.txt', '_DOWNLOADED.txt'))
+        print("\nDownloading blob to " + full_path_to_file2)
+        blob_service_client.get_blob_to_path(
+            container_name, local_file_name, full_path_to_file2)
+
+        sys.stdout.write("Sample finished running. When you press Enter, the sample will be deleted and the sample "
+                         "application will exit.")
+        sys.stdout.flush()
+        input()
+
+        # Clean up resources. This includes the container and the temp files
+        blob_service_client.delete_container(container_name)
+        os.remove(full_path_to_file)
+        os.remove(full_path_to_file2)
+    except Exception as e:
+        print(e)
+
+
+# Main method.
+if __name__ == '__main__':
+    run_sample()

+ 221 - 0
data/purposeCombined/Azure/django-blob.py

@@ -0,0 +1,221 @@
+import mimetypes
+import datetime
+
+from azure.common import AzureMissingResourceHttpError
+from azure.storage.blob import BlobService
+
+from django.core.files.storage import Storage
+from django.conf import settings
+
+try:
+    from django.utils.deconstruct import deconstructible
+except ImportError:
+    # Fallback for Django versions below 1.7, which do not provide deconstructible
+    def deconstructible(func):
+        return func
+
+
+@deconstructible
+class AzureStorage(Storage):
+    """
+    Custom file storage system for Azure
+    """
+
+    container = settings.AZURE_STORAGE.get('CONTAINER')
+    account_name = settings.AZURE_STORAGE.get('ACCOUNT_NAME')
+    account_key = settings.AZURE_STORAGE.get('ACCOUNT_KEY')
+    cdn_host = settings.AZURE_STORAGE.get('CDN_HOST')
+    use_ssl = settings.AZURE_STORAGE.get('USE_SSL')
+
+    def __init__(self, account_name=None, account_key=None, container=None,
+         use_ssl=None, cdn_host=None):
+
+        if account_name is not None:
+            self.account_name = account_name
+
+        if account_key is not None:
+            self.account_key = account_key
+
+        if container is not None:
+            self.container = container
+
+        if use_ssl is not None:
+            self.use_ssl = use_ssl
+
+        if cdn_host is not None:
+            self.cdn_host = cdn_host
+
+    def __getstate__(self):
+        return dict(
+            account_name=self.account_name,
+            account_key=self.account_key,
+            container=self.container,
+            cdn_host=self.cdn_host,
+            use_ssl=self.use_ssl
+        )
+
+    def _get_service(self):
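+        # Lazily create the BlobService on first use and cache it on the instance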
+        if not hasattr(self, '_blob_service'):
+            self._blob_service = BlobService(
+                account_name=self.account_name,
+                account_key=self.account_key,
+                protocol='https' if self.use_ssl else 'http'
+            )
+
+        return self._blob_service
+
+    def _get_properties(self, name):
+        return self._get_service().get_blob_properties(
+            container_name=self.container,
+            blob_name=name
+        )
+
+    def _open(self, name, mode='rb'):
+        """
+        Return the AzureStorageFile.
+        """
+
+        from django.core.files.base import ContentFile
+
+        contents = self._get_service().get_blob_to_bytes(
+            container_name=self.container,
+            blob_name=name
+        )
+
+        return ContentFile(contents)
+
+    def _save(self, name, content):
+        """
+        Use the Azure Storage service to write ``content`` to a remote file
+        (called ``name``).
+        """
+        
+
+        content.open()
+
+        content_type = None
+
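+        # Prefer the content type reported by the uploaded file; otherwise guess it from the file name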
+        if hasattr(content.file, 'content_type'):
+            content_type = content.file.content_type
+        else:
+            content_type = mimetypes.guess_type(name)[0]
+
+        cache_control = self.get_cache_control(
+            self.container,
+            name,
+            content_type
+        )
+
+        self._get_service().put_block_blob_from_file(
+            container_name=self.container,
+            blob_name=name,
+            stream=content,
+            x_ms_blob_content_type=content_type,
+            cache_control=cache_control,
+            x_ms_blob_cache_control=cache_control
+        )
+
+        content.close()
+
+        return name
+
+    def listdir(self, path):
+        """
+        Lists the contents of the specified path, returning a 2-tuple of lists;
+        the first item being directories, the second item being files.
+        """
+
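+        # Blob storage has no real directory hierarchy, so everything under the prefix is returned in the files list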
+        files = []
+
+        if path and not path.endswith('/'):
+            path = '%s/' % path
+
+        path_len = len(path)
+
+        if not path:
+            path = None
+
+        blob_list = self._get_service().list_blobs(self.container, prefix=path)
+
+        for blob in blob_list:
+            files.append(blob.name[path_len:])
+
+        return ([], files)
+
+    def exists(self, name):
+        """
+        Returns True if a file referenced by the given name already exists in
+        the storage system, or False if the name is available for a new file.
+        """
+        try:
+            self._get_properties(name)
+
+            return True
+        except AzureMissingResourceHttpError:
+            return False
+
+    def delete(self, name):
+        """
+        Deletes the file referenced by name.
+        """
+
+        try:
+            self._get_service().delete_blob(self.container, name)
+        except AzureMissingResourceHttpError:
+            pass
+
+    def get_cache_control(self, container, name, content_type):
+        """
+        Get the Cache-Control value for a blob, used when saving the blob on
+        Azure.  Returns `None` by default to remain compatible with the
+        default setting for the SDK.
+        """
+
+        return None
+
+    def size(self, name):
+        """
+        Returns the total size, in bytes, of the file referenced by name.
+        """
+
+        try:
+            properties = self._get_properties(name)
+
+            return int(properties['content-length'])
+        except AzureMissingResourceHttpError:
+            pass
+
+    def url(self, name):
+        """
+        Returns the URL where the contents of the file referenced by name can
+        be accessed.
+        """
+
+        blob_url_args = {
+            'container_name': self.container,
+            'blob_name': name,
+        }
+
+        if self.cdn_host:
+            # The account name should be built into the cdn hostname
+            blob_url_args['account_name'] = ''
+            blob_url_args['host_base'] = self.cdn_host
+
+        return self._get_service().make_blob_url(
+            **blob_url_args
+        )
+
+    def modified_time(self, name):
+        """
+        Returns a datetime object containing the last modified time.
+        """
+
+        try:
+            properties = self._get_properties(name)
+
+            return datetime.datetime.strptime(
+                properties['last-modified'],
+                '%a, %d %b %Y %H:%M:%S %Z'
+            )
+        except AzureMissingResourceHttpError:
+            pass

+ 1 - 0
data/purposeCombined/Azure/python-text-classification

@@ -0,0 +1 @@
+Subproject commit 8078e57805781f1453f1dd7ea84f8b93aa70cafa

+ 555 - 0
data/purposeCombined/Azure/storage-blob.py

@@ -0,0 +1,555 @@
+#----------------------------------------------------------------------------------
+# Microsoft Developer & Platform Evangelism
+#
+# Copyright (c) Microsoft Corporation. All rights reserved.
+#
+# THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, 
+# EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES 
+# OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
+#----------------------------------------------------------------------------------
+# The example companies, organizations, products, domain names,
+# e-mail addresses, logos, people, places, and events depicted
+# herein are fictitious.  No association with any real company,
+# organization, product, domain name, email address, logo, person,
+# places, or events is intended or should be inferred.
+#----------------------------------------------------------------------------------
+
+import os
+import config
+from random_data import RandomData
+import base64
+import datetime
+import time
+from azure.storage import CloudStorageAccount, AccessPolicy
+from azure.storage.blob import BlockBlobService, PageBlobService, AppendBlobService
+from azure.storage.models import CorsRule, Logging, Metrics, RetentionPolicy, ResourceTypes, AccountPermissions
+from azure.storage.blob.models import BlobBlock, ContainerPermissions, ContentSettings
+#
+# Azure Storage Blob Sample - Demonstrate how to use the Blob Storage service. 
+# Blob storage stores unstructured data such as text, binary data, documents or media files. 
+# Blobs can be accessed from anywhere in the world via HTTP or HTTPS. 
+#
+ 
+# Documentation References: 
+#  - What is a Storage Account - http://azure.microsoft.com/en-us/documentation/articles/storage-whatis-account/ 
+#  - Getting Started with Blobs - https://azure.microsoft.com/en-us/documentation/articles/storage-python-how-to-use-blob-storage/
+#  - Blob Service Concepts - http://msdn.microsoft.com/en-us/library/dd179376.aspx 
+#  - Blob Service REST API - http://msdn.microsoft.com/en-us/library/dd135733.aspx 
+#  - Blob Service Python API - http://azure.github.io/azure-storage-python/ref/azure.storage.blob.html
+#  - Storage Emulator - http://azure.microsoft.com/en-us/documentation/articles/storage-use-emulator/ 
+#
+class BlobAdvancedSamples():
+
+    def __init__(self):
+        self.random_data = RandomData()
+
+    # Runs all samples for Azure Storage Blob service.
+    # Input Arguments:
+    # account - CloudStorageAccount to use for running the samples
+    def run_all_samples(self, account):
+        print('\n\nAzure Storage Blob advanced sample - Starting.')
+        
+        try:
+            print('\n\n* Container operations *\n')
+            self.list_containers(account)
+
+            print('\n\n* Set CORS *\n')
+            self.set_cors_rules(account)
+
+            print('\n\n* Container lease *\n')
+            self.lease_container(account)
+
+            print('\n\n* Copy blob *\n')
+            self.copy_blob(account)
+            
+            print('\n\n* Page blob operations *\n')
+            self.page_blob_operations(account)
+            
+            print('\n\n* Block blob operations *\n')
+            self.block_blob_operations(account)
+
+            print('\n\n* Properties and Metadata operations *\n')
+            self.properties_and_metadata_operations(account)
+            
+            print('\n\n* Container ACL operations *\n')
+            self.container_acl_operations(account)
+
+            print('\n\n* Blob lease *\n')
+            self.lease_blob(account)  
+            
+            if (config.IS_EMULATED):
+                print('\nShared Access Signature is not supported in emulator');
+            else:
+                print('\n\n* Container with SAS operations *\n')
+                self.container_operations_with_sas(account)      
+  
+                print('\n\n* SAS with access policy *\n')
+                self.sas_with_container_access_policy(account)
+
+                print('\n\n* Set blob service logging and metrics properties *\n')
+                self.set_service_properties(account)
+
+        except Exception as e:
+            if (config.IS_EMULATED):
+                print('Error occurred in the sample. If you are using the emulator, please make sure the emulator is running.', e)
+            else: 
+                print('Error occurred in the sample. Please make sure the account name and key are correct.', e)
+
+        finally:
+            print('\nAzure Storage Blob advanced sample - Completed.\n')
+
+
+    # Copy a source blob to a destination blob
+    def copy_blob(self, account):
+
+        file_upload = "HelloWorld.png"
+        container_name = 'blockblobcontainer' + self.random_data.get_random_name(6)
+
+        # Create a Block Blob Service object
+        blockblob_service = account.create_block_blob_service()
+
+        try:
+            # Create a new container
+            print('1. Create a container with name - ' + container_name)
+            blockblob_service.create_container(container_name)
+                    
+            # Upload file as a block blob
+            print('2. Upload BlockBlob')
+            #Get full path on drive to file_to_upload by joining the fully qualified directory name and file name on the local drive
+            full_path_to_file = os.path.join(os.path.dirname(__file__), file_upload)
+            blockblob_service.create_blob_from_path(container_name, file_upload, full_path_to_file)
+
+            target_blob = "target.png"
+            blob_source_url = blockblob_service.make_blob_url(container_name, file_upload)
+
+            print('3. Copy blob')
+            blockblob_service.copy_blob(container_name, target_blob, blob_source_url)
+
+            print('4. Get target blob')
+            target_blob_properties = blockblob_service.get_blob_properties(container_name, target_blob)
+
+            print('5. Get copy properties')
+            copy_properties = target_blob_properties.properties.copy
+            
+            print('Copy properties status: ' + copy_properties.status)
+
+            if(copy_properties.status == "pending"):
+                print('6. Abort copy')
+                blockblob_service.abort_copy_blob(container_name, target_blob, copy_properties.id)
+        finally:
+            # Delete the container
+            print("7. Delete Container")
+            if blockblob_service.exists(container_name):
+                blockblob_service.delete_container(container_name)
+
+    def sas_with_container_access_policy(self, account):
+        container_name = 'demosasblobcontainer' + self.random_data.get_random_name(6)
+        
+        blockblob_service = account.create_block_blob_service()
+        
+        try:
+            print('1. Create a container with name - ' + container_name)
+            blockblob_service.create_container(container_name)
+            
+            print('2. Create blob "blob1" with text')
+            blockblob_service.create_blob_from_text(container_name, 'blob1', b'hello world')
+
+            print('3. Set access policy for container')
+            # Set access policy on container
+            access_policy = AccessPolicy(permission=ContainerPermissions.READ,
+                                        expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=1))
+            identifiers = {'id': access_policy}
+            acl = blockblob_service.set_container_acl(container_name, identifiers)
+
+            # Wait 30 seconds for acl to propagate
+            print('Wait 30 seconds for acl to propagate')
+            time.sleep(30)
+
+            print('4. Get sas for access policy in container')
+            # Indicates to use the access policy set on the container
+            sas = blockblob_service.generate_container_shared_access_signature(
+                container_name,
+                id='id'
+            )
+
+            print('5. Create blob service with sas')
+            # Create a service and use the SAS
+            shared_blockblob_service = BlockBlobService(
+                account_name=account.account_name,
+                sas_token=sas,
+            )
+
+            print('6. Read blob content with sas')
+            blob = shared_blockblob_service.get_blob_to_text(container_name, 'blob1')
+            content = blob.content # hello world
+        finally:
+            print('7. Delete container')
+            blockblob_service.delete_container(container_name)
+        
+        print("SAS with access policy sample completed")
+        
+    def container_operations_with_sas(self, account):
+        container_name = 'demosasblobcontainer' + self.random_data.get_random_name(6)
+        
+        # Create a Block Blob Service object
+        blockblob_service = account.create_block_blob_service()
+        
+        # Create a Shared Access Signature for the account
+        print('1.Get account sas')
+        
+        account_sas = blockblob_service.generate_account_shared_access_signature(
+            ResourceTypes.CONTAINER + ResourceTypes.OBJECT, 
+            AccountPermissions.READ + AccountPermissions.WRITE + AccountPermissions.DELETE + AccountPermissions.LIST + AccountPermissions.CREATE, 
+            datetime.datetime.utcnow() + datetime.timedelta(hours=1))
+
+        shared_account = CloudStorageAccount(account_name=account.account_name, sas_token=account_sas)
+        shared_account_block_service = shared_account.create_block_blob_service()
+
+        try:
+            print('2. Create container with account sas. Container name - ' + container_name)
+            shared_account_block_service.create_container(container_name)
+            
+            # For the purposes of the demo, get a Container SAS
+            # In a real-world application, the above Account SAS can be used
+            print('3. Get container sas')
+            container_sas = blockblob_service.generate_container_shared_access_signature(
+                container_name, 
+                ContainerPermissions.READ + ContainerPermissions.WRITE + ContainerPermissions.DELETE + ContainerPermissions.LIST, 
+                datetime.datetime.utcnow() + datetime.timedelta(hours=1))
+            
+            shared_container_account = CloudStorageAccount(account_name=account.account_name, sas_token=container_sas)
+            shared_container_block_service = shared_container_account.create_block_blob_service()
+            
+            print('4. Create blob with container sas')
+            shared_container_block_service.create_blob_from_text(container_name, 'myblob', 'blob data')
+            
+            print('5. List blobs with container sas')
+            blobs = shared_container_block_service.list_blobs(container_name)
+            for blob in blobs:
+                print('blob ' + blob.name)
+            
+            print('6. Delete blob with container sas')
+            shared_container_block_service.delete_blob(container_name, 'myblob')
+        finally:            
+            print('7. Delete container')
+            blockblob_service.delete_container(container_name)
+            
+        print("Containers Sas sample completed")
+        
+    def list_containers(self, account):
+        
+        container_prefix = 'blockblobcontainers' + self.random_data.get_random_name(6)
+        
+        # Create a Block Blob Service object
+        blockblob_service = account.create_block_blob_service()
+
+        try:
+            # Create containers
+            for i in range(5):
+                container_name = container_prefix + str(i)
+                print('1. Create a container with name - ' + container_name)
+                blockblob_service.create_container(container_name)
+            
+            # List all the blobs in the container 
+            print('2. List containers with prefix ' + container_prefix)
+            containers = blockblob_service.list_containers(container_prefix)
+            for container in containers:
+                print('\tContainer Name: ' + container.name)
+        finally:
+            # Delete the containers
+            print("3. Delete Containers")
+            for i in range(5):
+                container_name = container_prefix + str(i)
+                if blockblob_service.exists(container_name):
+                    blockblob_service.delete_container(container_name)
+            
+        print("Containers sample completed")
+
+    def container_acl_operations(self, account):
+        
+        container_name = 'aclblockblobcontainer' + self.random_data.get_random_name(6)
+        
+        # Create a Block Blob Service object
+        blockblob_service = account.create_block_blob_service()
+
+        try:
+            print('1. Create a container with name - ' + container_name)
+            blockblob_service.create_container(container_name)
+                
+            print('2. Set access policy for container')
+            access_policy = AccessPolicy(permission=ContainerPermissions.READ,
+                                        expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=1))
+            identifiers = {'id': access_policy}
+            blockblob_service.set_container_acl(container_name, identifiers)
+
+            print('3. Get access policy from container')
+            acl = blockblob_service.get_container_acl(container_name)
+
+            print('4. Clear access policy in container')
+            # Clear
+            blockblob_service.set_container_acl(container_name)
+
+        finally:            
+            print('5. Delete container')
+            blockblob_service.delete_container(container_name)
+            
+        print("Container ACL operations sample completed")
+        
+    def properties_and_metadata_operations(self, account):
+        file_blob_name = "HelloWorld.png"
+        text_blob_name = "Text"
+         
+        # Create a Block Blob Service object
+        blockblob_service = account.create_block_blob_service()
+
+        container_name = 'blockblobbasicscontainer' + self.random_data.get_random_name(6)
+
+        try:
+            # Create a new container
+            print('1. Create a container with name and custom metadata - ' + container_name)
+            blockblob_service.create_container(container_name, {'sample':'azure-storage'})
+                    
+            # Upload file as a block blob
+            print('2. Uploading BlockBlob from file with properties and custom metadata')
+            #Get full path on drive to file_to_upload by joining the fully qualified directory name and file name on the local drive
+            full_path_to_file = os.path.join(os.path.dirname(__file__), file_blob_name)
+            
+            blockblob_service.create_blob_from_path(container_name, file_blob_name, full_path_to_file, 
+                content_settings=ContentSettings(content_type='image/png'),
+                metadata={'category':'azure-samples'})
+            
+            blockblob_service.create_blob_from_text(container_name, text_blob_name, 'Data',
+                content_settings=ContentSettings(content_encoding ='UTF-8', content_language='en'),
+                metadata={'origin':'usa', 'title': 'azure-samples'})
+            
+            # Get all the container properties 
+            print('3. Get Container metadata')
+
+            container = blockblob_service.get_container_properties(container_name)
+            
+            print('    Metadata:')
+
+            for key in container.metadata:
+                print('        ' + key + ':' + container.metadata[key])
+            
+            # Get all the blob properties 
+            print('4. Get Blob properties')
+            blob = blockblob_service.get_blob_properties(container_name, file_blob_name)
+            
+            print('    Metadata:')
+            for key in blob.metadata:
+                print('        ' + key + ':' + blob.metadata[key])
+            
+            print('    Properties:')
+            print('        Content-Type:' + blob.properties.content_settings.content_type)
+        finally:            
+            # Delete the container
+            print("5. Delete Container")
+            if blockblob_service.exists(container_name):
+                blockblob_service.delete_container(container_name)
+        
+    # Set CORS
+    def set_cors_rules(self, account):
+
+        # Create a Block Blob Service object
+        blockblob_service = account.create_block_blob_service()
+        
+        cors_rule = CorsRule(
+            allowed_origins=['*'], 
+            allowed_methods=['POST', 'GET'],
+            allowed_headers=['*'],
+            exposed_headers=['*'],
+            max_age_in_seconds=3600)
+        
+        print('1. Get Cors Rules')
+        original_cors_rules = blockblob_service.get_blob_service_properties().cors
+        
+        try:
+            print('2. Overwrite Cors Rules')
+            blockblob_service.set_blob_service_properties(cors=[cors_rule])
+        finally:        
+            print('3. Revert Cors Rules back to the original ones')
+            #reverting cors rules back to the original ones
+            blockblob_service.set_blob_service_properties(cors=original_cors_rules)
+        
+        print("CORS sample completed")
+
+    # Lease Container
+    def lease_container(self, account):
+        # Create a Block Blob Service object
+        blockblob_service = account.create_block_blob_service()
+        
+        try:
+            container_name = 'blockblobcontainer' + self.random_data.get_random_name(6)
+            print('1. Create a container with name - ' + container_name)
+            blockblob_service.create_container(container_name)
+
+            print('2. Acquire lease on container')
+            lease_id = blockblob_service.acquire_container_lease(container_name, lease_duration=15)
+
+            print("3. Try to delete container without lease")
+            try:
+                blockblob_service.delete_container(container_name)
+            except:
+                print('Got expected exception. Cannot delete container, lease not specified')
+        finally:
+            print("4. Delete container with lease")
+            blockblob_service.delete_container(container_name, lease_id=lease_id)
+
+        print("Lease container sample completed")
+
+    # Lease Blob
+    def lease_blob(self, account):
+        blob_name = "exclusive"
+        
+        # Create a block blob service object
+        blockblob_service = account.create_block_blob_service()
+        container_name = 'blobcontainer' + self.random_data.get_random_name(6)
+
+        try:
+            # Create a new container
+            print('1. Create a container with name - ' + container_name)
+            blockblob_service.create_container(container_name)
+                    
+            # Create a block blob
+            print('2. Create Block Blob')
+            blob = self.random_data.get_random_bytes(255)
+            blockblob_service.create_blob_from_bytes(container_name, blob_name, blob)
+            
+            print('3. Acquire lease on blob')
+            lease_id = blockblob_service.acquire_blob_lease(container_name, blob_name, lease_duration=15)
+            
+            # Write to a block blob
+            print('4. Try to write to Block Blob without lease')
+            block_id = self.random_data.get_random_name(32)
+            block = self.random_data.get_random_bytes(255)
+            try:
+                blockblob_service.put_block(container_name, blob_name, block, block_id)
+            except:
+                print('Got expected exception. Cannot write blob, lease not specified')
+
+            print('5. Write to Block Blob with lease')
+            blockblob_service.put_block(container_name, blob_name, block, block_id, lease_id=lease_id)
+
+            print("6. Try to delete blob without lease")
+            try:
+                blockblob_service.delete_blob(container_name, blob_name)
+            except:
+                print('Got expected exception. Cannot delete blob, lease not specified')
+
+            print("7. Delete blob with lease")
+            blockblob_service.delete_blob(container_name, blob_name, lease_id=lease_id)
+        finally:
+            print("8. Delete container")
+            if blockblob_service.exists(container_name):
+                blockblob_service.delete_container(container_name)
+
+        print("Lease blob sample completed")
+        
+    #Page Blob Operations
+    def page_blob_operations(self, account):
+        file_to_upload = "HelloWorld.png"
+        page_size = 1024;
+        
+        # Create a page blob service object
+        pageblob_service = account.create_page_blob_service()
+        container_name = 'pageblobcontainer' + self.random_data.get_random_name(6)
+
+        try:
+            # Create a new container
+            print('1. Create a container with name - ' + container_name)
+            pageblob_service.create_container(container_name)
+            
+            # Create a new page blob to upload the file
+            print('2. Create a page blob')
+            pageblob_service.create_blob(container_name, file_to_upload, page_size * 1024)
+            
+            # Read the file
+            print('3. Upload pages to page blob')
+            index = 0
+            with open(file_to_upload, "rb") as file:
+                file_bytes = file.read(page_size)
+                while len(file_bytes) > 0:
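+                    # The last chunk is zero-padded to a full page_size so the update range stays aligned to the 512-byte pages required by page blobs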
+                    if len(file_bytes) < page_size:
+                        file_bytes = bytes(file_bytes + bytearray(page_size - len(file_bytes)))
+                        
+                    pageblob_service.update_page(container_name, file_to_upload, file_bytes, index * page_size, index * page_size + page_size - 1)
+                    
+                    file_bytes = file.read(page_size)
+                    
+                    index = index + 1
+            
+            pages = pageblob_service.get_page_ranges(container_name, file_to_upload)
+            
+            print('4. Enumerate pages in page blob')
+            for page in pages:
+                print('Page ' + str(page.start) + ' - ' + str(page.end))
+        finally:
+            print('5. Delete container')
+            if pageblob_service.exists(container_name):
+                pageblob_service.delete_container(container_name)
+
+    #Block Blob Operations
+    def block_blob_operations(self, account):
+        file_to_upload = "HelloWorld.png"
+        block_size = 1024
+        
+        # Create a block blob service object
+        blockblob_service = account.create_block_blob_service()
+        container_name = 'blockblobcontainer' + self.random_data.get_random_name(6)
+
+        try:
+            # Create a new container
+            print('1. Create a container with name - ' + container_name)
+            blockblob_service.create_container(container_name)
+            
+            blocks = []
+            
+            # Read the file
+            print('2. Upload file to block blob')
+            with open(file_to_upload, "rb") as file:
+                file_bytes = file.read(block_size)
+                while len(file_bytes) > 0:
+                    block_id = self.random_data.get_random_name(32) 
+                    blockblob_service.put_block(container_name, file_to_upload, file_bytes, block_id)                    
+                    
+                    blocks.append(BlobBlock(id=block_id))
+                    
+                    file_bytes = file.read(block_size)
+            
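+            # Commit the staged blocks in order; they are not part of the readable blob until the block list is committed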
+            blockblob_service.put_block_list(container_name, file_to_upload, blocks)
+            
+            print('3. Get the block list')
+            blockslist = blockblob_service.get_block_list(container_name, file_to_upload, None, 'all')
+            blocks = blockslist.committed_blocks
+
+            print('4. Enumerate blocks in block blob')
+            for block in blocks:
+                print('Block ' + block.id)
+        finally:
+            print('5. Delete container')
+            if blockblob_service.exists(container_name):
+                blockblob_service.delete_container(container_name)
+
+    # Manage properties of the Blob service, including logging and metrics settings, and the default service version.
+    def set_service_properties(self, account):
+
+        # Create a block blob service object
+        blockblob_service = account.create_block_blob_service()
+
+        print('1. Get Blob service properties')
+        props = blockblob_service.get_blob_service_properties();
+
+        retention = RetentionPolicy(enabled=True, days=5)
+        logging = Logging(delete=True, read=False, write=True, retention_policy=retention)
+        hour_metrics = Metrics(enabled=True, include_apis=True, retention_policy=retention)
+        minute_metrics = Metrics(enabled=False)
+
+        try:
+            print('2. Overwrite Blob service properties')
+            blockblob_service.set_blob_service_properties(logging=logging, hour_metrics=hour_metrics, minute_metrics=minute_metrics, target_version='2015-04-05')
+        finally:
+            print('3. Revert Blob service properties back to the original ones')
+            blockblob_service.set_blob_service_properties(logging=props.logging, hour_metrics=props.hour_metrics, minute_metrics=props.minute_metrics, target_version='2015-04-05')
+
+        print('4. Set Blob service properties completed')

+ 130 - 0
data/purposeCombined/Azure/table-service.py

@@ -0,0 +1,130 @@
+import requests
+import config
+from azure import storage
+from PackageInformationWorker.PyPIPackageInformation import PyPIPackageInformation
+import json
+import azure.storage.queue as queue
+import traceback
+import urllib.parse
+import logging
+
+logger = logging.getLogger()
+account_name = config.STORAGE_ACCOUNT_NAME
+account_key = config.STORAGE_ACCOUNT_KEY
+STATIC_ROW_KEY = 'ROWKEY'
+table_service = storage.CloudStorageAccount(account_name, account_key).create_table_service()
+table_service.create_table(config.PACKAGE_VERSION_DATA_TABLENAME)
+table_service.create_table(config.PACKAGE_SUMMARY_TABLENAME)
+
+def main():
+    # package, version = ('azure', '1.0.0')
+    # get a package to look at
+    # check that package and version.
+    # version data just gets filled in
+    # summary trickier.
+    # summary -> name,
+    #               first_published (might be different than python2_start if
+    #               not using trove classifier)
+    #               python2_start (change if we find earlier),
+    #               python2_end (change if we find earlier, remove if package
+    #               after this come in and has python2),
+    #               python3_start (change if we find earlier)
+    try:
+        qs = queue.QueueService(config.STORAGE_ACCOUNT_NAME, config.STORAGE_ACCOUNT_KEY)
+        messages_in_batch = 5
+
+        while True:
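+            # Pull a small batch and hide it for messages_in_batch*60 seconds so other workers do not process the same messages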
+            messages = qs.get_messages(config.PACKAGE_QUEUE_NAME,numofmessages=messages_in_batch, visibilitytimeout=messages_in_batch*60)
+            for message in messages:
+                entity = json.loads(message.message_text)
+                _process_one_package(entity["package"], entity["version"])
+                # once completed delete the message
+                qs.delete_message(config.PACKAGE_QUEUE_NAME, message.message_id, message.pop_receipt)
+    except Exception as e:
+        # swallow exception here. we will just reprocess and delete the message.
+        # known failures:
+        # - connection aborted by get_messages sometimes.  this happens with a connectionreseterror (10054)
+        # - Random json errors. Could add retry.  
+        logger.error(traceback.format_exc())
+          
+def _process_one_package(package_name, version):
+    logger.info("Worker: Package:{} Version:{}".format(package_name, version))
+    if not package_name or not version:
+        logger.warning("package_name or version was empty. Moving on as the queue had bad data")
+        return
+
+    # .6684 seconds to run.  74577 total packages
+    package_info = PyPIPackageInformation.get_package_specific_version_info(package_name, version)
+    if not package_info:
+        logger.error("Worker: Package:{} Version:{} failed to get package info".format(package_name, version))
+        return
+
+    supports_python_2 = len([x for x in package_info['classifiers'] if x.startswith('Programming Language :: Python :: 2')]) > 0
+    supports_python_3 = len([x for x in package_info['classifiers'] if x.startswith('Programming Language :: Python :: 3')]) > 0
+    uploaded = package_info['uploaded']
+
+    try:
+        summary_entity = table_service.get_entity(config.PACKAGE_SUMMARY_TABLENAME, package_name, STATIC_ROW_KEY)
+    except:
+        # we don't have a summary for this entry.
+        summary_entity = { 
+            'PartitionKey':package_name, 'RowKey':STATIC_ROW_KEY, 'First_Published':None, 
+            'Python2_Start':None, 'Python2_End':None, 'Python3_Start':None
+            }
+        table_service.insert_or_replace_entity(config.PACKAGE_SUMMARY_TABLENAME, package_name, STATIC_ROW_KEY, summary_entity)
+        summary_entity = table_service.get_entity(config.PACKAGE_SUMMARY_TABLENAME, package_name, STATIC_ROW_KEY)
+
+    # set fields using upload. Upload is none if the version has never been uploaded
+    # Basically just filter out packages that never have content from our records.
+    if uploaded is not None:
+        if not hasattr(summary_entity, 'First_Published') or summary_entity.First_Published is None or summary_entity.First_Published > uploaded:
+            # if the published date is empty or later than the current release we
+            # are viewing update
+            summary_entity.First_Published = uploaded
+
+        if supports_python_2 and \
+            (not hasattr(summary_entity, 'Python2_Start') or summary_entity.Python2_Start is None or summary_entity.Python2_Start > uploaded):
+            # if the published date is empty or later than the date and it supports
+            # python 2
+            summary_entity.Python2_Start = uploaded
+    
+        if supports_python_2 and hasattr(summary_entity, 'Python2_End') and summary_entity.Python2_End is not None and summary_entity.Python2_End < uploaded:
+            # this release supports python2 but was uploaded after the date we thought
+            # python2 support ended, so support must not really have ended
+            summary_entity.Python2_End = None    
+        elif hasattr(summary_entity, 'Python2_Start') and hasattr(summary_entity, 'Python2_End') and \
+            summary_entity.Python2_Start is not None and summary_entity.Python2_End is not None and \
+            (summary_entity.Python2_End > uploaded and summary_entity.Python2_Start < uploaded):
+            # if we don't support python2, and we have started supporting python2
+            # at some point
+            # and if the date we are saying we ended is after the start
+            summary_entity.Python2_End = uploaded
+
+        if supports_python_3 and \
+            (not hasattr(summary_entity, 'Python3_Start') or summary_entity.Python3_Start is None or summary_entity.Python3_Start > uploaded):
+            # if the published date is empty or later than the current release we
+            # are viewing update
+            summary_entity.Python3_Start = uploaded
+
+    version_entity = _insert_entity_to_package_version_table(package_name, version, supports_python_2, supports_python_3, package_info['downloads'], uploaded)
+    summary_entity = table_service.insert_or_replace_entity(config.PACKAGE_SUMMARY_TABLENAME, package_name, STATIC_ROW_KEY, summary_entity)
+
+def _insert_entity_to_package_version_table(package, version, python2, python3, downloads, upload_time):
+    # TODO: issue with python azure storage.  Version can't have '~' in it. https://github.com/Azure/azure-storage-python/issues/76
+    package_sanitized = urllib.parse.quote_plus(package)
+    version_sanitized = urllib.parse.quote_plus(version)
+
+    try:
+        result =  table_service.insert_or_replace_entity(config.PACKAGE_VERSION_DATA_TABLENAME, package_sanitized, version_sanitized,
+                                    {'PartitionKey' : package_sanitized,
+                                     'RowKey': version_sanitized, 
+                                     'Python2': python2, 
+                                     'Python3': python3,
+                                     'Downloads': downloads,
+                                     'UploadTime': upload_time})
+
+        return result
+    except Exception as e:
+        logger.error("Failed to insert Package:{} Version:{} Python2:{} Python3:{} Downloads:{} UploadTime:{} Exception:{}".format(
+            package, version, python2, python3, downloads, upload_time, traceback.format_exc()))
+        raise e

+ 218 - 0
data/purposeCombined/Azure/table-storage.py

@@ -0,0 +1,218 @@
+#-------------------------------------------------------------------------
+# Microsoft Developer & Platform Evangelism
+#
+# Copyright (c) Microsoft Corporation. All rights reserved.
+#
+# THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, 
+# EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES 
+# OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
+#----------------------------------------------------------------------------------
+# The example companies, organizations, products, domain names,
+# e-mail addresses, logos, people, places, and events depicted
+# herein are fictitious. No association with any real company,
+# organization, product, domain name, email address, logo, person,
+# places, or events is intended or should be inferred.
+#--------------------------------------------------------------------------
+import config
+import datetime
+import time
+from random_data import RandomData
+from tablestorageaccount import TableStorageAccount
+from azure.storage import CloudStorageAccount, AccessPolicy
+from azure.storage.table import TableService, Entity, TablePermissions
+from azure.storage.models import CorsRule, Logging, Metrics, RetentionPolicy, ResourceTypes, AccountPermissions
+
+#
+# Azure Table Service Sample - Demonstrate how to perform common tasks using the Microsoft Azure Table Service
+# including creating a table, CRUD operations and different querying techniques.
+#
+# Documentation References:
+#  - What is a Storage Account - http://azure.microsoft.com/en-us/documentation/articles/storage-whatis-account/
+#  - Getting Started with Tables - https://azure.microsoft.com/en-us/documentation/articles/storage-python-how-to-use-table-storage/
+#  - Table Service Concepts - http://msdn.microsoft.com/en-us/library/dd179463.aspx
+#  - Table Service REST API - http://msdn.microsoft.com/en-us/library/dd179423.aspx
+#  - Table Service Python API - http://azure.github.io/azure-storage-python/ref/azure.storage.table.html
+#  - Storage Emulator - http://azure.microsoft.com/en-us/documentation/articles/storage-use-emulator/
+#
+class TableAdvancedSamples():
+
+    def __init__(self):
+        self.random_data = RandomData()
+
+    # Runs all samples for Azure Storage Table service.
+    def run_all_samples(self, account):
+        table_service = account.create_table_service()
+        print('Azure Storage Advanced Table samples - Starting.')
+        
+        print('\n\n* List tables *\n')
+        self.list_tables(table_service)
+        
+        if not account.is_azure_cosmosdb_table():
+            print('\n\n* Set service properties *\n')
+            self.set_service_properties(table_service)
+
+            print('\n\n* Set Cors rules *\n')
+            self.set_cors_rules(table_service)
+
+            print('\n\n* ACL operations *\n')
+            self.table_acl_operations(table_service)
+        
+        if (config.IS_EMULATED):
+            print('\n\n* Shared Access Signature is not supported in emulator *\n')
+        else:
+            print('\n\n* SAS operations *\n')
+            self.table_operations_with_sas(account)
+
+        print('\nAzure Storage Advanced Table samples - Completed.\n')
+
+    # Manage tables including creating, listing and deleting
+    def list_tables(self, table_service):
+        table_prefix = 'table' + self.random_data.get_random_name(6)
+
+        try:        
+            # Create tables
+            for i in range(5):
+                table_name = table_prefix + str(i)
+                print('1. Create a table with name - ' + table_name)
+                table_service.create_table(table_name)
+            
+            # List all the tables 
+            print('2. List tables')
+            tables = table_service.list_tables()
+            for table in tables:
+                print('\tTable Name: ' + table.name)
+
+        finally:
+            # Delete the tables
+            print("3. Delete Tables")
+            for i in range(5):
+                table_name = table_prefix + str(i)
+                if(table_service.exists(table_name)):
+                    table_service.delete_table(table_name)
+            
+        print("List tables sample completed")
+    
+    # Manage properties of the Table service, including logging and metrics settings, and the default service version.
+    def set_service_properties(self, table_service):
+        print('1. Get Table service properties')
+        props = table_service.get_table_service_properties()
+
+        retention = RetentionPolicy(enabled=True, days=5)
+        logging = Logging(delete=True, read=False, write=True, retention_policy=retention)
+        hour_metrics = Metrics(enabled=True, include_apis=True, retention_policy=retention)
+        minute_metrics = Metrics(enabled=False)
+
+        try:
+            print('2. Overwrite Table service properties')
+            table_service.set_table_service_properties(logging=logging, hour_metrics=hour_metrics, minute_metrics=minute_metrics)
+
+        finally:
+            print('3. Revert Table service properties back to the original ones')
+            table_service.set_table_service_properties(logging=props.logging, hour_metrics=props.hour_metrics, minute_metrics=props.minute_metrics)
+
+        print('4. Set Table service properties completed')
+    
+    # Manage CORS rules on the table service
+    def set_cors_rules(self, table_service):
+        cors_rule = CorsRule(
+            allowed_origins=['*'], 
+            allowed_methods=['POST', 'GET'],
+            allowed_headers=['*'],
+            exposed_headers=['*'],
+            max_age_in_seconds=3600)
+        
+        print('1. Get Cors Rules')
+        original_cors_rules = table_service.get_table_service_properties().cors
+
+        try:        
+            print('2. Overwrite Cors Rules')
+            table_service.set_table_service_properties(cors=[cors_rule])
+
+        finally:
+            #reverting cors rules back to the original ones
+            print('3. Revert Cors Rules back to the original ones')
+            table_service.set_table_service_properties(cors=original_cors_rules)
+        
+        print("CORS sample completed")
+
+    # Manage table access policy
+    def table_acl_operations(self, table_service):
+        table_name = 'acltable' + self.random_data.get_random_name(6)
+
+        try:        
+            print('1. Create a table with name - ' + table_name)
+            table_service.create_table(table_name)
+                
+            print('2. Set access policy for table')
+            access_policy = AccessPolicy(permission=TablePermissions.QUERY,
+                                        expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=1))
+            identifiers = {'id': access_policy}
+            table_service.set_table_acl(table_name, identifiers)
+
+            print('3. Wait 30 seconds for acl to propagate')
+            time.sleep(30)
+
+            print('4. Get access policy from table')
+            acl = table_service.get_table_acl(table_name)
+
+            print('5. Clear access policy in table')
+            table_service.set_table_acl(table_name)
+
+        finally:
+            print('6. Delete table')
+            if(table_service.exists(table_name)):
+                table_service.delete_table(table_name)
+            
+        print("Table ACL operations sample completed")
+    
+    # Manage shared access signature on a table
+    def table_operations_with_sas(self, account):
+        table_name = 'sastable' + self.random_data.get_random_name(6)
+        
+        try:
+            # Create a Table Service object
+            table_service = account.create_table_service()
+            
+            print('1. Create table with name - ' + table_name)
+            table_service.create_table(table_name)
+            
+            # Create a Shared Access Signature for the table
+            print('2. Get sas for table')
+            
+            table_sas = table_service.generate_table_shared_access_signature(
+                table_name, 
+                TablePermissions.QUERY + TablePermissions.ADD + TablePermissions.UPDATE + TablePermissions.DELETE, 
+                datetime.datetime.utcnow() + datetime.timedelta(hours=1))
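+            # This SAS is scoped to the single table with query/add/update/delete rights and expires after one hour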
+
+            shared_account = TableStorageAccount(account_name=account.account_name, sas_token=table_sas, endpoint_suffix=account.endpoint_suffix)
+            shared_table_service = shared_account.create_table_service()
+
+            # Create a sample entity to insert into the table
+            customer = {'PartitionKey': 'Harp', 'RowKey': '1', 'email' : 'harp@contoso.com', 'phone' : '555-555-5555'}
+
+            # Insert the entity into the table
+            print('3. Insert new entity into table with sas - ' + table_name)
+            shared_table_service.insert_entity(table_name, customer)
+            
+            # Demonstrate how to query the entity
+            print('4. Read the inserted entity with sas.')
+            entity = shared_table_service.get_entity(table_name, 'Harp', '1')
+            
+            print(entity['email'])
+            print(entity['phone'])
+
+            # Demonstrate how to update the entity by changing the phone number
+            print('5. Update an existing entity by changing the phone number with sas')
+            customer = {'PartitionKey': 'Harp', 'RowKey': '1', 'email' : 'harp@contoso.com', 'phone' : '425-123-1234'}
+            shared_table_service.update_entity(table_name, customer)
+
+            # Demonstrate how to delete an entity
+            print('6. Delete the entity with sas')
+            shared_table_service.delete_entity(table_name, 'Harp', '1')
+
+        finally:
+            print('7. Delete table')
+            if(table_service.exists(table_name)):
+                table_service.delete_table(table_name)
+            
+        print("Table operations with sas completed")

BIN
data/purposeCombined/BI/.DS_Store


+ 47 - 0
data/purposeCombined/BI/BIL.py

@@ -0,0 +1,47 @@
+import numpy as np
+import pandas as pd 
+import matplotlib.pyplot as plt
+import seaborn as sns
+import warnings
+warnings.filterwarnings('ignore')
+
+data=pd.read_csv('D:/Ajay/input/Suicide.csv')
+
+data=data.drop(['HDI for year','country-year'],axis=1)                  #dropping these two columns
+
+#-----Table------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+group_data=data.groupby(['age','sex'])['suicides_no'].sum().unstack()  #group the data and pivot it into a table using 'unstack()'
+group_data=group_data.reset_index().melt(id_vars='age')                #reshape so each row is an (age, sex) pair
+group_data_female=group_data.iloc[:6,:]                                #keep the first 6 rows (the female age groups) using 'iloc'
+print("\n--Table of Suicides according to Female Age Groups--\n")
+from IPython.display import display
+display(group_data_female)                                             #display the table
+print("\n")
+
+#-----Country vs. suicide_no-------------------------------------------------------------------------------------------------------------------------------------------
+
+suicidesNo=[]
+for country in data.country.unique():                                   
+    suicidesNo.append(sum(data[data['country']==country].suicides_no))  #getting total no of suicides of all countries
+
+suicidesNo=pd.DataFrame(suicidesNo,columns=['suicides_no'])
+country=pd.DataFrame(data.country.unique(),columns=['country'])
+data_suicide_countr=pd.concat([suicidesNo,country],axis=1)              #defining the data to plot
+
+data_suicide_countr=data_suicide_countr.sort_values(by='suicides_no',ascending=False)#sort in descending order (highest number of suicides first)
+
+sns.barplot(y=data_suicide_countr.country[:20],x=data_suicide_countr.suicides_no[:20])  #show bars for only the 20 countries with the highest number of suicides
+plt.title("20 Countries with Highest Suicide Number from 1985 to 2016")
+plt.show()
+
+#-----Population vs. Age_group-----------------------------------------------------------------------------------------------------------------------------------------
+
+index_suicide=[]
+for age in data['age'].unique():
+    index_suicide.append(sum(data[data['age']==age].suicides_no)/len(data[data['age']==age].suicides_no))  #getting suicide rate of each age group
+    
+plt.bar(['5-14 years', '15-24 years', '25-34 years', '35-54 years', '55-74 years', '75+ years'],index_suicide,align='center',alpha=0.5) #defining xticks
+plt.xticks(rotation=45)                                                 #rotate the xticks by 45 degrees
+plt.title("Suicide rates of Different Age Groups")
+plt.show()
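+
+#-----Note--------------------------------------------------------------------------------------------------------------------------------------------------------------
+#The per-age loop above computes the mean 'suicides_no' per record for each age group.
+#A minimal equivalent one-liner (added sketch; 'index_suicide_alt' is an illustrative name,
+#not part of the original analysis), assuming the same 'data' DataFrame:
+index_suicide_alt = data.groupby('age')['suicides_no'].mean()           #same values, indexed by age label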

+ 1 - 0
data/purposeCombined/BI/BusinessIntelligence-Kaggle

@@ -0,0 +1 @@
+Subproject commit 06143b2ae0538affe8029950bf36597d253bcffd

+ 606 - 0
data/purposeCombined/BI/ID3_classification.py

@@ -0,0 +1,606 @@
+# TODO mention in the report that with every level of the tree the data gets smaller and smaller.
+# NEXT STEPS
+# TODO: Create an infographic and host it on a web page.
+# TODO: Gather live data from news articles (Can try using NLTK & urllib).
+# TODO: Use Natural Language Processing to automate some of the data cleaning/integration.
+
+###################################################################################################################
+# Online Retail Analysis - ID3 CLASSIFICATION                                                                     #
+#    NOTE! Concepts will be explained with examples from the Street data set, which can be found below.           #
+#    The reason for this is because that data set is very small and easy to follow.                               #
+#                                                                                                                 #
+# 1) RESOURCES                                                                                                    #
+#    ID3 TUTORIALS:                                                                                               #
+#      1) https://sefiks.com/2017/11/20/a-step-by-step-id3-decision-tree-example/                                 #
+#      2) https://medium.com/coinmonks/what-is-entropy-and-why-information-gain-is-matter-4e85d46d2f01            #
+#                                                                                                                 #
+#    DECISION TREE TUTORIAL: https://www.lucidchart.com/pages/decision-tree                                       #
+#    ENTROPY (MORE DETAILS): https://en.wikipedia.org/wiki/Entropy_(information_theory)                           #
+#                                                                                                                 #
+# 2) DATA SETS                                                                                                    #
+#    TEST DATA SET: This data set can be found by navigating to the PLAY TENNIS DATA SET region in this file.     #
+#    It is a part of the ID3 file because I believe it would be useful to have an example of how the ID3 code     #
+#    works with a data set and also provides an opportunity to better understand what the code is doing.          #
+#    To have a look at ID3 applied to a small data set just call the test_run_algorithm() function at the         #
+#    end of the file.                                                                                             #
+#                                                                                                                 #
+# 3) ALGORITHM OVERVIEW                                                                                           #
+#    Used to generate a decision tree from a given data set. It works by evaluating each attribute                #
+#    in the data set to place the nodes in an order that will return an accurate result.                          #
+#                                                                                                                 #
+# 4) USES                                                                                                         #
+#    A) Classify labeled data generally to do with NLP, approving loans and credit cards, etc.                    #
+#    B) Another non-standard use of this algorithm is to use it to fill a missing value in the data set           #
+#    during the pre-processing stage.                                                                             #
+#                                                                                                                 #
+###################################################################################################################
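+
+# Added reference (not part of the original header): the two quantities the algorithm ranks
+# attributes by, matching calc_entropy / calc_info_gain below.
+#     Entropy of a labelled set S:          H(S) = -sum_i p_i * log2(p_i)
+#     Information gain of an attribute A:   Gain(S, A) = H(S) - sum_v (|S_v| / |S|) * H(S_v)
+# where S_v is the subset of S in which attribute A takes the value v; the attribute with the
+# highest gain becomes the next node in the tree.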
+
+import math
+import copy
+
+# region PERFORMANCE IMPROVEMENTS (for Python 3.8)
+""" 
+Applied: (TO DOCUMENT)
+
+TODO: 
+   1) Remove every dict.keys() call and use the dict itself; membership tests and iteration on the
+      dict directly are the idiomatic form (and in Python 2, dict.keys() built a full list in memory).
+      https://stackoverflow.com/questions/4730993/python-key-in-dict-keys-performance-for-large-dictionaries
+"""
+# endregion
+
+# region PLAY TENNIS DATA SET
+DATASET_BY_ATTRIB_DICT = {"outlook": ["Sunny", "Sunny", "Overcast", "Rain", "Rain", "Rain", "Overcast",
+                                      "Sunny", "Sunny", "Rain", "Sunny", "Overcast", "Overcast", "Rain"],
+                          "temperature": ["Hot", "Hot", "Hot", "Mild", "Cool", "Cool", "Cool",
+                                          "Mild", "Cool", "Mild", "Mild", "Mild", "Hot", "Mild"],
+                          "humidity": ["High", "High", "High", "High", "Normal", "Normal", "Normal",
+                                       "High", "Normal", "Normal", "Normal", "High", "Normal", "High"],
+                          "wind": ["Weak", "Strong", "Weak", "Weak", "Weak", "Strong", "Strong",
+                                   "Weak", "Weak", "Weak", "Strong", "Strong", "Weak", "Strong"]}
+
+
+# Answer as to whether or not it is a good time to play tennis.
+TARGET_ATTRIB_LIST = ["No", "No", "Yes", "Yes", "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "Yes", "Yes", "No"]
+
+# CONSTANT VARIABLES  # TODO: Optimise these variables by making them immutable (specifying they are const with Python)
+TARGET_ATTRIB_NAME = "play tennis"
+TRAIN_DATA_SIZE = len(TARGET_ATTRIB_LIST)
+# endregion
+
+
+# Represents a tree node and links to derived nodes.
+class Node:
+
+    def __init__(self, node_name, derived_nodes=[]):
+        self.node_name = node_name
+        # Copy the incoming list so nodes never share Python's mutable default argument;
+        # an explicit None marks an endpoint/leaf node (see build_node / classify).
+        self.derived_nodes = list(derived_nodes) if derived_nodes is not None else None
+
+
+class ID3DecisionTree:
+    def __init__(self):
+        self.root_node = None
+
+        # Keeps track of all the nodes at the end of the branches that are available to link to.
+        # In this way, no code needs to be run to find the next available space for a new node.
+        # The node at index 0 is always the one to add to first, once the new node is linked to it, it gets popped off
+        # and the new node gets appended to the end of this list.
+        self.active_branch_nodes = []
+
+        # TODO: Merge this list with the active_branch_nodes to be in dictionary format like so
+        # {attrib1: [outcome1, outcome2], attrib2: [outcome1, outcome2, outcome3]}
+        self.linked_attributes = []
+
+        # IMPORTANT NOTE:
+        # Key to understanding how the DecisionTree class works is understanding the dataset_occurrence_dict
+        # structure, as that is what is used for most calculations. This structure contains only the data from the
+        # dataset required to construct the tree. Any repetition of attribute data has been removed to reduce load.
+        # The 'dataset_occurrence_dict' structure is an unordered dictionary, where the structure itself gives more
+        # information about the dataset. For example, every attribute of the data set is a key, which contains
+        # a dictionary of its outcomes/possible values, and for each outcome, there is a dictionary showing the
+        # distribution of the outcomes for the selected target attribute.
+        # Example of dictionary structure below.
+        """ Example structure: (where 'AN'-attribute name; 'ON'-outcome name; 'TON'-target outcome name) 
+            dataset_occurrence_dict = {"AN 1": {"ON 1": {"TON 1": 1, "TON 2": 2},
+                                                "ON 2": {"TON 1": 0, "TON 2": 1},
+                                                "ON 3": {"TON 1": 0, "TON 2": 1}
+                                                },
+                                       "AN 2": {"ON 1": {"TON 1": 4, "TON 2": 0},
+                                                "ON 2": {"TON 1": 1, "TON 2": 0}
+                                                }
+                                       }
+                                                
+            The example above can be read, for attribute 1 - AN1, there are 3 outcomes - ON1, ON2, ON3. 
+            The target has 2 possible outcomes TON1 and TON2. Those values are being tracked/accounted for, 
+            for each possible outcome of each attribute. For AN1, ON1 there is 1 occurrence of TON1 and 2 occurrences of 
+            TON2. For AN1, ON2 there are 0 occurrences of TON1, and 1 occurrence of TON2 therefore the answer for this 
+            branch is TON2. Same for AN1, ON3 - answer TON2. If all the occurrences of TON1 and TON2 for attrib 1 (AN1)
+            are summed, we get the number of entries in the given data set. 
+        """
+        self.dataset_occurrence_dict = {}
+
+    # region BUILD TREE UTILITIES
+    """ Construct dataset distribution/occurrence dictionary - "dataset_occurrence_dict".
+    PARAMETERS
+      :param (dict) dataset_by_attrib_dict
+      :param (list) target_list """
+    def generate_occurrences(self, dataset_by_attrib_dict, target_list):
+        # TODO: assert that all attribute lists have the same length
+
+        # Update the dictionary with each attribute
+        for attrib_name in dataset_by_attrib_dict.keys():
+            # STEP 1: ADD the current attribute to the 'dataset_occurrence_dict' structure
+            self.dataset_occurrence_dict.update({attrib_name: {}})
+
+            # STEP 2: Fetch a list containing only the unique data from attribute_list and target_list.
+            attribute_list = dataset_by_attrib_dict[attrib_name]
+            unique_attrib_outcomes = list(set(attribute_list))
+            unique_answers = list(set(target_list))
+
+            # For each unique outcome of the current attribute
+            for attrib_outcome in unique_attrib_outcomes:
+                #   2.1) Update dictionary to store the next attribute outcome
+                self.dataset_occurrence_dict[attrib_name].update({attrib_outcome: {}})
+                # print(self.dataset_occurrence_dict)
+
+                #   2.2) For the current attribute, look at each of its outcomes and add them onto the dictionary
+                for outcome in unique_answers:
+                    self.dataset_occurrence_dict[attrib_name][attrib_outcome].update({outcome: 0})
+                    # print(self.dataset_occurrence_dict)
+
+            # STEP 3: Goes through the dataset and counts the target outcome occurrences for each attribute occurrence
+            for itter in range(len(attribute_list)):
+                #   3.1) Fetch the current attribute outcome and the current target outcome from the dataset.
+                curr_attrib_occ = attribute_list[itter]
+                curr_target_occ = target_list[itter]
+
+                #   3.2) Update the count for the current target outcome in the current attribute outcome by 1
+                self.dataset_occurrence_dict[attrib_name][curr_attrib_occ][curr_target_occ] += 1
+
+    """ After a node is added to the tree the "dataset_occurrence_dict" dictionary should be updated.
+       PARAMETERS
+         :param (list) attrib_list - the raw attrib data from the dataset.
+         :param (list) target_list - the raw target data from the dataset. """
+    def get_next_branch_occurrences(self, dataset_by_attrib_dict, target_list):
+        # This is the outcome to update the dataset_occurrence_dict by
+
+        # A completely separate dictionary from the original, this dictionary will only hold a subdictionary
+        # of the original
+        subdict = copy.deepcopy(dataset_by_attrib_dict)
+        subtar = copy.deepcopy(target_list)
+
+        indices_to_remove = []
+        attrib_to_remove = None
+
+        # Looking through every possible attribute in the dictionary
+        for attrib_key in subdict:
+            attrib_found = False
+            # Count through each list of outcomes for the given attribute.
+            for count in range(len(subdict[attrib_key])):
+                # If the active outcome name is equal to the current outcome value in the list
+                if dataset_by_attrib_dict[attrib_key][count] == self.active_branch_nodes[0].node_name:
+                    attrib_found = True
+                    # According to the algorithm, the attribute containing the currently active outcome
+                    # should be removed
+                    if attrib_key in subdict:
+                        attrib_to_remove = attrib_key
+                else:
+                    indices_to_remove.append(count)
+                    # print(subdict[attrib_key][count])
+                    # subdict[attrib_key].pop(count)
+                    # TODO: assert that there is only one 0 in the list otherwise it is trying to remove the wrong values
+
+            if attrib_found:
+                break
+
+        # Processing the subdict data
+        #print("Subdict: ", subdict)
+        del subdict[attrib_to_remove]
+
+        for attrib in subdict:
+            #print("Discarding data in ", attrib)
+            complete_list = subdict[attrib]
+
+            sublist = [value for index, value in enumerate(complete_list) if index not in indices_to_remove]
+            subdict[attrib] = sublist
+
+        #print("After processing the data: ", subdict)
+
+        # Processing the subtar data
+        #print("Discarding data in target list")
+        #print("Target data before processing: ", subtar)
+        # print(indices_to_remove)
+        subtar = [value for index, value in enumerate(subtar) if index not in indices_to_remove]
+        #print("Target data after processing: ", subtar)
+
+        # TODO: Call this function recursively on each branch, pass in the shrinked dictionary
+        # TODO: test the base case thoroughly
+        # TODO: Build a new dataset_by_attrib_dict for the current outcome
+        # TODO: REMOVE outlook from the dataset dict when all its outcomes have children nodes assigned
+        # (How to know if an attribute is complete???)
+
+        return subdict, subtar
+
+    """ Checks if a branch is complete, i.e. the target outcome was found. 
+    PARAMETERS
+      :param  (dict) target_val_dist_for_attrib 
+      :returns (list) comp_branches - contains all the target outcomes reached for the given attribute."""
+    def track_target_outcomes(self, target_val_dist_for_attrib):
+        comp_branches = []
+
+        # Looks through each attribute outcome
+        for attrib_outcome_key in target_val_dist_for_attrib.keys():
+
+            # Tracks how many non-zero occurrences of a target outcome there are for this attribute outcome.
+            non_zero_outcome_count = 0
+
+            # This variable is set to the target outcome if the branch outcome is (100%) certain.
+            branch_answer = None
+
+            # Checks what the distribution of target outcomes is for the current attribute outcome.
+            # Ex: question - how do people drive based on the terrain, if the terrain is flat do they drive slow
+            # or fast, and what is it if the terrain is steep.
+            # Target outcomes - fast and slow; attrib outcomes - flat and steep.
+            # Distribution dictionary looks like this ->{'fast': {'slow': 0, 'fast': 1}, 'steep':{'slow': 2, 'fast': 1}}
+            for target_outcome_key in target_val_dist_for_attrib[attrib_outcome_key].keys():
+
+                # Fetch the number of occurrences for each target outcome for the current attribute
+                """"Another Example: if the target is can_buy_computer(possible values/outcomes: Yes or No) and the current 
+                attribute is age (possible values/outcomes:  <=30, 31..40 and >40) this will return how many of the entries 
+                where age is <=30 are no, then how many of the entries where age is <=30 are yes, then how many 
+                of the entries where age is 31..40 are yes and so on, until all cases are looked at. """
+                outcome_occurrences = target_val_dist_for_attrib[attrib_outcome_key][target_outcome_key]
+
+                # Check if the answer is certain and end the branch, i.e. count how many branches have
+                # certain target outcome
+                if outcome_occurrences > 0:
+                    non_zero_outcome_count += 1
+
+                    if non_zero_outcome_count == 1:
+                        branch_answer = target_outcome_key
+
+            if non_zero_outcome_count == 0:
+                print("INVALID RESULT!")
+            elif non_zero_outcome_count == 1:
+                print("THE ANSWER FOR <<", attrib_outcome_key, ">> is <<", branch_answer, ">>")
+                comp_branches.append({attrib_outcome_key: branch_answer})
+            elif non_zero_outcome_count > 1:
+                print("THE BRANCH <<", attrib_outcome_key, ">> IS STILL ACTIVE!")
+
+        return comp_branches
+
+    # Counts the occurrences of each value for a given attribute.
+    def count_value_occ(self, unique_values, attrib_data):
+        attrib_val_occ = {}
+
+        # Construct dictionary
+        for value in unique_values:
+            attrib_val_occ.update({value: 0})
+
+        # Initialise Dictionary
+        for u_value in unique_values:
+            attrib_val_occ[u_value] = attrib_data.count(u_value)
+
+        return attrib_val_occ
+
+    def calc_entropy(self, attrib_uv_count, overall):
+        entropy = 0
+        # print("UV: ", attrib_uv_count)
+
+        for key in attrib_uv_count.keys():
+
+            # if there is some occurrence of the value calculate entropy,
+            # otherwise ignore it (when there is 0 occurrences of the value)
+            if attrib_uv_count[key] != 0:
+                fraction = attrib_uv_count[key] / overall
+                target_attrib_calc = fraction * math.log2(fraction)
+
+                entropy += target_attrib_calc
+
+        return abs(entropy)
+
+    def calc_attrib_entropy(self, attrib_occurrences):
+        entropy_list = {}
+
+        for attrib_val_key in attrib_occurrences.keys():
+            attrib_val = attrib_occurrences[attrib_val_key]
+            overall = 0
+            for target_values in attrib_val.values():
+                overall += target_values
+
+            print("CALC TARGET ENTROPY FOR EACH ATTRIB OUTCOME: ", attrib_val)
+            attrib_entropy = self.calc_entropy(attrib_val, overall)
+            entropy_list.update({attrib_val_key: attrib_entropy})
+
+        print("Entropy list: ", entropy_list)
+
+        return entropy_list
+
+    # WEIGHTED AVERAGE ENTROPY for the children
+    def calc_entropy_weigh_avg(self, target_val_dist_attrib, overall, attrib_entropy):
+        weighted_entropy_avg = 0
+        for key in target_val_dist_attrib.keys():
+            curr_value = 0
+
+            for value in target_val_dist_attrib[key].values():
+                curr_value += value
+            weighted_entropy_avg += curr_value / overall * attrib_entropy[key]
+            # overall += curr_value
+
+        return weighted_entropy_avg
+
+    def calc_info_gain(self, target_entropy, target_dist_for_attrib):
+
+        # CALCULATE ENTROPY OF Attribute
+        attrib_entropy = self.calc_attrib_entropy(target_dist_for_attrib)
+        # print("Attrib Entropy: ", attrib_entropy)
+
+        # Weight each outcome by its share of the data reaching this branch (sum of the
+        # occurrence counts) rather than the full training set, so sub-branch weights sum to 1.
+        branch_size = sum(sum(outcome.values()) for outcome in target_dist_for_attrib.values())
+        weighted_avg_e = self.calc_entropy_weigh_avg(target_dist_for_attrib, branch_size, attrib_entropy)
+        # print("Attrib Weighted AVG: ", weighted_avg_e)
+
+        attrib_info_gain = target_entropy - weighted_avg_e
+
+        return attrib_info_gain
+
+    # IMPORTANT NOTE: An attribute node should always be made together with its outcomes, never an outcome alone
+    # as it is not how this function was setup.
+    # :param (str) name - should always be the name of an attribute.
+    def build_node(self, name, completed_branches):
+        attrib_node = Node(name)
+        derived_nodes = []
+
+        completed_outcomes = []
+        for branch in completed_branches:
+            completed_outcomes.append(list(branch.keys())[0])
+
+        # if all outcome branches for this attribute are completed, then the attribute is complete and its outcomes
+        # should be popped off the active_branch_nodes list
+        # print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> CHECK COMPLETE ATTRIB: ", completed_branches)
+
+        # print(self.dataset_occurrence_dict[name].keys())
+        for outcome_name in self.dataset_occurrence_dict[name]:
+            new_outcome_node = Node(outcome_name)
+            # print("STATUS: NEW OUTCOME NODE CREATED")
+
+            # Check if the branch for the current outcome is complete (Target answer is 100% certain).
+            for branch in completed_branches:
+                if outcome_name in branch:
+                    # print("FOUND OUTCOME <<", outcome_name, ">> in ", branch)
+
+                    if len(new_outcome_node.derived_nodes) == 0:
+                        # Formally end the node
+                        endpoint_node = Node(branch[outcome_name], None)
+                        new_outcome_node.derived_nodes.append(endpoint_node)
+                        # print("STATUS: NEW OUTCOME ENDPOINT NODE CREATED & LINKED")
+
+            # The temp_outcome node is created so that the outcome node stored in the tree and the outcome node stored
+            # in the active_branch_nodes list are the same. This is important because I never append directly onto the
+            # tree but to a reference of the active branch of the tree. This allows to append to any depth of the tree
+            # without needing to do any traversal to find the next available node.
+            temp_outcome = copy.deepcopy(new_outcome_node)
+            derived_nodes.append(temp_outcome)
+
+            # If the branch is still active/available to add to
+            if outcome_name not in completed_outcomes:
+                # Add the new node to the active branch list
+                self.active_branch_nodes.append(temp_outcome)
+            """print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Completed Nodes:", acc_completed)
+        acc_completed[name]["completed"] = True
+        all_outcomes_list = list(self.dataset_occurrence_dict[name].keys())
+
+        for outcome in all_outcomes_list:
+                if outcome in acc_completed[name]["outcomes"]:
+                    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ", outcome, " TRUE")
+                else:
+                    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>> ", outcome, " FALSE")
+                    acc_completed[name]["completed"] = False
+
+            print(all_outcomes_list)"""
+
+            new_outcome_node.derived_nodes.clear()
+
+        # print("STATUS: NEW NODE CREATED")
+        attrib_node.derived_nodes = derived_nodes
+        return attrib_node
+
+    # IMPORTANT NOTE: active_branch_nodes is only updated when the build_node function is called, therefore
+    # the link will not be appropriate unless the node was created through the build_node function.
+    def link_node(self, new_node):
+        """
+        print("  <<< CHECKING IF THE TREE SEGMENT IS BUILT RIGHT! >>>    ")
+        # TEMP
+        print("ATTRIBUTE/PARENT NODE: ", new_node.node_name)
+        print("DERIVED NODES LIST: ", new_node.derived_nodes)
+
+        print("FOR EACH NODE IN DERIVED NODES.")
+        for node in new_node.derived_nodes:
+            print("\t OUTCOME NODE FOR ATTRIB: ", node.node_name)
+            for other in node.derived_nodes:
+                print("\t\t TARGET OUTCOME REACHED: ", other.node_name)"""
+        if self.root_node is None:
+            self.root_node = new_node
+
+        else:
+            # Add the new node to the tree
+            # I hard coded 0 as the active node index because index 0 is always the next available node to link to.
+            self.active_branch_nodes[0].derived_nodes.append(new_node)
+
+            # Update the available nodes!
+            # The node at index 0 is already taken so that node should be popped off
+            self.active_branch_nodes.pop(0)
+
+    # Builds a part of the tree (attribute node with setup derived nodes/outcome nodes) and links it to the tree.
+    def build_tree_chunk(self, dataset_by_attrib_dict, target_attrib_list):
+        self.generate_occurrences(dataset_by_attrib_dict, target_attrib_list)
+        # print("Main DICTIONARY", self.dataset_occurrence_dict)
+
+        # TARGET ATTRIBUTE CALCULATIONS - Required for the calculation of info_gain for the rest of the attributes.
+        target_uv_data = list(set(target_attrib_list))  # TODO: POSSIBLE EFFICIENCY DECREASE
+        target_uv_count = self.count_value_occ(target_uv_data, target_attrib_list)
+        # print("Target Unique Value Count: ", target_uv_count)
+
+        # Use the size of the data reaching this branch (equals TRAIN_DATA_SIZE only at the root)
+        target_entropy = self.calc_entropy(target_uv_count, len(target_attrib_list))
+        # print("TARGET ENTROPY: ", target_entropy)
+
+        # Build each node (calculate its entropy and info_gain, and assign each attribute's outcomes as children),
+        # store the node in the node list and sort the nodes by info_gain to build the tree with them.
+        next_node_data = {"name": None, "info gain": 0, "completed": None}
+
+        for attrib_name in self.dataset_occurrence_dict.keys():
+            print("\n", "-" * 50)
+
+            # ATTRIB CALCULATIONS
+            print("attrib_name: ", attrib_name)
+
+            # Contains a data structure representing the target attribute's value distribution
+            # with regard to another attribute
+            target_dist_for_attrib = self.dataset_occurrence_dict[attrib_name]
+            # print("Target occurrences: ", target_dist_for_attrib)
+
+            # Check if any of the branches is completed
+            completed_branches = self.track_target_outcomes(target_dist_for_attrib)
+            print("COMPLETED BRANCHES: ", completed_branches)
+
+            attrib_info_gain = self.calc_info_gain(target_entropy, target_dist_for_attrib)
+            # print("The INFO GAIN for <<", attrib_name, ">> is ", attrib_info_gain)
+
+            if next_node_data["info gain"] < attrib_info_gain:
+                next_node_data["name"] = attrib_name
+                next_node_data["info gain"] = attrib_info_gain
+                next_node_data["completed"] = completed_branches
+
+        print("------> The next new node is: ", next_node_data["name"], "\n\n")
+        new_node = self.build_node(next_node_data["name"], next_node_data["completed"])
+        self.link_node(new_node)
+
+    # endregion
+
+    def build_tree(self, dataset_by_attrib_dict, target_attrib_list):
+
+        self.build_tree_chunk(dataset_by_attrib_dict, target_attrib_list)
+        print("\n\n")
+
+        while len(self.active_branch_nodes) != 0:
+            print(">>>>>>>>>>>>>>>>>>> Current active node: ", self.active_branch_nodes[0].node_name)
+            # self.linked_attrib_names
+            sub_attrib_dict, sub_tar_list = self.get_next_branch_occurrences(dataset_by_attrib_dict, target_attrib_list)
+            self.build_tree_chunk(sub_attrib_dict, sub_tar_list)
+            print("\n\n>>>>>>>>>>>>>>>>>>> List of active nodes: ", self.active_branch_nodes)
+
+        print("\n\n", "<"*5, "THE TREE IS COMPLETE!", ">"*5, "\n\n")
+
+    def visualise_tree(self):
+        # Recursively print the tree; branch_track grows by one "\t" per level so the
+        # output reads as a hierarchy. Endpoint nodes (derived_nodes is None/empty) end a branch.
+        def print_branch(node, branch_track=""):
+            print(branch_track + str(node.node_name))
+            if node.derived_nodes:
+                for derived in node.derived_nodes:
+                    print_branch(derived, branch_track + "\t")
+
+        if self.root_node is not None:
+            print_branch(self.root_node)
+
+    # This function runs classification on one entry and returns the answer.
+    # Should only be called after the tree model was built.
+    def classify(self, entry_index, dataset_by_attrib_dict):
+        answer = None
+
+        # TODO: assert that root node is not none
+        current_node = self.root_node
+
+        while current_node.derived_nodes is not None:
+            print("\n  <<< TRAVERSING TREE >>>  ")
+            print("Current Attrib: ", current_node.node_name)
+
+            # Ask the tree which attribute/column to look for first
+            column_name = current_node.node_name
+
+            # Fetch the value for the given entry (entry_index) from the column identified by the tree.
+            current_outcome_name = dataset_by_attrib_dict[column_name][entry_index]
+            print("\tCurrent outcome name: ", current_outcome_name)
+
+            # Get that node from the derived nodes list
+            for outcome_node in current_node.derived_nodes:
+                if outcome_node.node_name == current_outcome_name:
+                    # print("\n  <<< TRAVERSING TREE >>>  ")
+                    # print("FOUND VALUE FOR ENTRY <<", entry_index, ">>  ->  <<", outcome_node.node_name, ">>")
+                    current_node = outcome_node.derived_nodes[0]
+                    # print("Current Attrib: ", current_node.node_name)
+                    answer = current_node.node_name
+
+        print("    <<< FOUND VALUE >>>  ")
+        print("    The answer is: ", answer)
+
+        return answer
+
+
+def test_run_algorithm():
+    print(" "*10, " << ID3 CLASSIFICATION ALGORITHM >> ", " "*10)
+
+    tree = ID3DecisionTree()
+    tree.build_tree(DATASET_BY_ATTRIB_DICT, TARGET_ATTRIB_LIST)
+
+    # APPLY CLASSIFICATION
+    # The index of the entry in the dataset.
+    entry_index = 0
+    tree.classify(entry_index, DATASET_BY_ATTRIB_DICT)
+
+
+test_run_algorithm()
+
+"""
+# Remove the completed branches
+for branch in completed_branches:
+    for key in branch.keys():
+        target_val_dist_for_grade.pop(key)
+
+print("After removing completed branches: ", target_val_dist_for_grade)
+"""
+
+# region Build Decision Tree
+
+# endregion
+
+""" 
+What is "Training Data"? 
+    Building the tree is done with training data, which already has the answer to whatever question is being asked. 
+    The example given with the data on the slides that asks if someone can buy a laptop is training data
+    because it already contains the answer.
+"""
+"""
+Apply the information gain function to each attribute, e.g. calculate_gain(attr_out).
+Should that be applied to the target as well? No.
+Example:
+    - G(train_data, O) = 0.246
+    - G(train_data, H) = 0.151
+    - G(train_data, W) = 0.048
+
+Once the root node is known, look at how many unique values there are.
+If there are 4 possible values and they are not numbers,
+for example "Sunny", "Rainy", etc., there should be 4 nodes.
+"""
+
+# region Apply Classification
+"""
+What is "Test Data"?
+    Test data is a new entry that we want to classify. 
+    For example: a bank may use an already trained ID3 model to check whether you should get a credit card or not.
+    It will have different attributes like - number of times you have gone bankrupt; what is your current net worth; 
+    are you a student; what is your credit score; etc.
+    The target attribute will then be EligibleForCreditCard(True or False).
+"""
+
+# Use the built decision tree to look through a row of data from the data set. This is done using test data.
+# (How to evaluate if the classification has an error?)
+""" 
+Steps: 
+    1. Find which attribute to look through first (to start with, ask the tree which attribute is the root node)
+        1.1 (When building the tree, make sure the attributes have exactly the same name as the Node data)
+        1.2 Search through all possible attributes
+        1.3 Check if the attribute name == the node name
+        
+    2. Find the attribute value for the current row
+        2.1 Ask the data set which value is given for this attribute
+        2.2 Find which of the child nodes in the tree is equivalent to the given value
+        
+    Repeat these steps recursively until an answer is found. 
+"""
+# endregion
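+
+# Added sketch (an assumption, not part of the original file): a recursive version of the
+# traversal described in the steps above, using the same Node structure from this file
+# (an endpoint/leaf node stores the answer and has derived_nodes set to None).
+def classify_recursive(node, entry_index, dataset_by_attrib_dict):
+    # Base case: a leaf/endpoint node carries the answer itself.
+    if not node.derived_nodes:
+        return node.node_name
+    # Step 1: the current node names the attribute/column to look at for this entry.
+    outcome_value = dataset_by_attrib_dict[node.node_name][entry_index]
+    # Step 2: follow the outcome branch whose name matches the entry's value.
+    for outcome_node in node.derived_nodes:
+        if outcome_node.node_name == outcome_value:
+            return classify_recursive(outcome_node.derived_nodes[0], entry_index, dataset_by_attrib_dict)
+    return None  # no matching branch for an unseen attribute value
+
+# Example call (assuming a tree built as in test_run_algorithm):
+# classify_recursive(tree.root_node, 0, DATASET_BY_ATTRIB_DICT)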

+ 336 - 0
data/purposeCombined/BI/Practica2.py

@@ -0,0 +1,336 @@
+# -*- coding: utf-8 -*-
+
+"""
+Author:
+    Francisco Solano López Rodríguez
+Date:
+    November 2018
+Content:
+    Practical 2: Clustering
+    Business Intelligence (Inteligencia de Negocio)
+    Degree in Computer Engineering
+    Universidad de Granada
+"""
+
+import time
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import numpy as np
+
+from sklearn.cluster import KMeans, AgglomerativeClustering, estimate_bandwidth
+from sklearn.cluster import Birch, SpectralClustering, MeanShift, DBSCAN, MiniBatchKMeans
+from sklearn import metrics
+from sklearn import preprocessing
+from math import floor
+import seaborn as sns
+from scipy.cluster.hierarchy import dendrogram,ward
+
+seed = 12345
+
+################### FUNCTIONS ###########################
+
+def getPrediction(algorithm, X):
+    t = time.time()
+    cluster_predict = algorithm.fit_predict(X) 
+    tiempo = time.time() - t
+
+    return cluster_predict, tiempo
+
+# Function to get the mean of each cluster
+def getMeans(dataFrame):
+    return dataFrame.groupby("cluster").mean()
+
+# Function to get the standard deviation of each cluster
+def getStd(dataFrame):
+    return dataFrame.groupby("cluster").std()
+
+# Function to plot the scatter matrix
+def DrawScatterMatrix(data, name=None, display=True, save=False):
+    sns.set()
+    variables = list(data)
+    variables.remove('cluster')
+    sns_plot = sns.pairplot(data, vars=variables, hue="cluster", palette='Paired', plot_kws={"s": 25},
+                            diag_kind="hist") 
+    sns_plot.fig.subplots_adjust(wspace=.03, hspace=.03)
+
+    if name != None:        
+        plt.title("scatter_"+name)
+
+    # Show the image on screen
+    if display:
+        plt.show()
+
+    # Save the image to disk
+    if save:        
+        if name == None:
+            name = "_unknown_"
+        image_name = "scatter/scatter_" + name + ".png"
+        plt.savefig(image_name)
+        plt.clf()
+        print("Image saved: ", image_name)
+
+# Function to plot the heatmap
+def DrawHeatmap(data, name = None, display=True, save = False):
+    data_normal = data.apply(norm_to_zero_one)
+    meanDF = getMeans(dataFrame = data_normal)
+    hm = sns.heatmap(data=meanDF, linewidths=.1, cmap="Blues", annot=True, xticklabels='auto')
+    plt.xticks(rotation=0)
+
+    # Only set the title when a name was given (avoids concatenating None)
+    if name != None:
+        plt.title("heatmap_"+name)
+
+    # Show the image on screen
+    if display:
+        plt.show()
+
+    # Save the image to disk
+    if save:
+        if name == None:
+            name = "_unknown_"
+        image_name = "heatmap/heatmap_" + name + ".png"
+        plt.savefig(image_name)
+        plt.clf()
+        print("Image saved: ", image_name)
+
+# Function to plot the dendrogram
+def DrawDendrogram(data, name = None, display=True, save = False):
+    data_normal = preprocessing.normalize(data,norm='l2')
+    # Use the locally normalised data (the original referenced the global X_normal)
+    linkage_array = ward(data_normal)
+
+    dendrogram(linkage_array,leaf_rotation=90., leaf_font_size=5.)
+
+    if name != None:
+        plt.title("dendrogram_" + name)
+
+    # Show the image on screen
+    if display:
+        plt.show()
+
+    # Save the image to disk
+    if save:
+        if name == None:
+            name = "_unknown_"
+        image_name = "dendrogram/dendrogram_" + name + ".png"
+        plt.savefig(image_name)
+        plt.clf()
+        print("Image saved: ", image_name)
+
+def dataFrameResultados(algoritmos, num_cluster, metrics_CH, metrics_SC, tiempos):
+    df_algo = pd.DataFrame(algoritmos, columns=['Algoritmo'])
+    df_nc = pd.DataFrame(num_cluster, columns=['Num. Clusters'])
+    df_CH = pd.DataFrame(metrics_CH, columns=['CH'])
+    df_SC = pd.DataFrame(metrics_SC, columns=['SH'])
+    df_t = pd.DataFrame(tiempos, columns=['Tiempo'])
+
+    resultados = pd.concat([df_algo, df_nc, df_CH, df_SC, df_t], axis=1)
+
+    return resultados
+
+def norm_to_zero_one(df):
+    return (df - df.min()) * 1.0 / (df.max() - df.min())
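+
+# Added example: norm_to_zero_one is a column-wise min-max scaling, e.g. a column with the
+# values [20, 35, 50] is mapped to [0.0, 0.5, 1.0] (each value is shifted by the column minimum
+# and divided by the column range).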
+
+
+def executeClustering(algorithms, X, caso):
+
+    f = open("caso_" + str(caso) + ".txt", 'w')
+
+    X_normal = X.apply(norm_to_zero_one)
+
+    names = []
+    num_cluster = []
+    metrics_CH = []
+    metrics_SC = []
+    tiempos = []
+
+    print("\nCaso de estudio ", caso, ", tamaño: ", len(X))
+    f.write("\nCaso de estudio " + str(caso) + ", tamaño: " + str(len(X)))
+
+    for algorithm, name_algorithm in algorithms:
+
+        print("\n----------------------------------------\n")
+        print("Ejecutando algoritmo: ", name_algorithm, "\n")
+        f.write("\n--------------------------------------\n")
+        f.write("Ejecutando algoritmo: " + name_algorithm + "\n")        
+        # Ejecución algoritmo clustering
+        cluster_predict, tiempo = getPrediction(algorithm, X_normal)
+
+        # Convert the predictions to a DataFrame
+        clusters = pd.DataFrame(cluster_predict,index=X.index,columns=['cluster'])
+
+        print("Tamaño de cada cluster:")
+        f.write("\nTamaño de cada cluster:\n")
+        size=clusters['cluster'].value_counts()
+
+        for num,i in size.iteritems():
+           print('%s: %5d (%5.2f%%)' % (num,i,100*i/len(clusters)))
+           f.write('%s: %5d (%5.2f%%)\n' % (num,i,100*i/len(clusters)))
+        print()
+
+        # Get the metric results
+        metric_CH = metrics.calinski_harabaz_score(X_normal, cluster_predict)
+        metric_SC = metrics.silhouette_score(X_normal, cluster_predict, metric='euclidean', 
+                                         sample_size=floor(0.2*len(X)), random_state=seed)
+
+        # Store the algorithm name, the number of clusters,
+        # the times and the metrics for later comparison
+        names.append(name_algorithm)   
+        num_cluster.append(len(set(cluster_predict)))
+        metrics_CH.append(metric_CH)
+        metrics_SC.append(metric_SC)
+        tiempos.append(tiempo)
+
+        # Add the cluster assignment as a column of X
+        X_cluster = pd.concat([X, clusters], axis=1)
+        X_normal_cluster = pd.concat([X_normal, clusters], axis=1)
+
+        name = "caso_" + str(caso) + "_" + name_algorithm  
+
+        # Plot the scatter matrix
+        DrawScatterMatrix(data = X_cluster, name = name, display = False, save = True)
+
+        # Plot the heatmap
+        DrawHeatmap(data = X_cluster, name = name, display = False, save = True)
+
+        # DataFrame with the mean of each feature in each cluster
+        meanDF = getMeans(dataFrame = X_cluster)
+        print()
+        print(meanDF)
+        f.write(meanDF.to_string())
+
+        # If the algorithm is AgglomerativeClustering, plot the dendrogram
+        if name_algorithm == 'AC':
+            DrawDendrogram(data = X_cluster, name = name, display = False, save = True)
+
+
+    resultados = dataFrameResultados(names, num_cluster, metrics_CH, metrics_SC, tiempos)
+
+    print("\n**************************************\n")
+    print(resultados.to_string())
+    print("\n**************************************\n")
+
+    f.write("\n**************************************\n")
+    f.write(resultados.to_string())
+    f.write("\n**************************************\n")
+
+    f.close()
+
+
+#########################################################
+
+# Read the data
+
+print("Reading the data set...")
+censo = pd.read_csv('censo_granada.csv')
+censo = censo.replace(np.NaN,0) 
+print("Read complete.")
+
+
+###### CASE STUDIES ######
+
+#-------- CASE 1 --------
+
+casado = 2   # married
+hombre = 1   # male
+mujer = 6    # female
+
+subset = censo.loc[(censo['EDAD']>=20) & (censo['EDAD']<=50) & (censo['SEXO']==mujer)]
+usadas = ['EDAD', 'NPFAM', 'HM5', 'H0515']
+X = subset[usadas]
+X_normal = preprocessing.normalize(X, norm='l2')
+
+#-------- CASE 2 --------
+
+subset_2 = censo.loc[(censo['EDAD']>=20) & (censo['EDAD']<=50) & (censo['SEXO']==hombre)]
+usadas_2 = ['EDAD', 'NPFAM', 'HM5', 'H0515']
+X_2 = subset_2[usadas_2]
+X_normal_2 = X_2.apply(norm_to_zero_one)
+
+#-------- CASE 3 --------
+
+subset_3 = censo.loc[(censo['EDAD']>=20) & (censo['EDAD']<=50) & (censo['SEXO']==mujer)]
+usadas_3 = ['EDAD', 'NPFAM', 'NHIJOS', 'ESREAL']
+X_3 = subset_3[usadas_3]
+X_normal_3 = X_3.apply(norm_to_zero_one)
+
+###############################
+
+# Get the correlation between the variables
+'''
+correlation = X.corr()
+sns.heatmap(correlation, square = True)
+plt.show()
+'''
+
+#################### Algorithms #####################
+
+random_seed = 123
+
+k_means = KMeans(init='k-means++', n_clusters=5, n_init=5, random_state=random_seed)
+
+agglo=AgglomerativeClustering(n_clusters=5,linkage="ward")
+
+meanshift = MeanShift(bin_seeding=True)
+
+miniBatchKMeans = MiniBatchKMeans(init='k-means++',n_clusters=4, n_init=5, max_no_improvement=10, verbose=0, random_state=random_seed)
+
+dbscan = DBSCAN(eps=0.2)
+
+dbscan2 = DBSCAN(eps=0.1)
+
+algorithms = [(k_means, "KMeans"),
+              (agglo, "AC"),
+              (meanshift, "MeanShift"), 
+              (miniBatchKMeans, "MiniBatchKM"),
+              (dbscan, "DBSCAN")]
+
+algorithms2 = [(k_means, "KMeans"),
+              (agglo, "AC"),
+              (meanshift, "MeanShift"), 
+              (miniBatchKMeans, "MiniBatchKM"),
+              (dbscan2, "DBSCAN2")]
+
+
+# KMeans with different numbers of clusters
+
+algorithm_kmeans = []
+
+for i in range(5,9):
+    kmeans_i = KMeans(init='k-means++', n_clusters=i, n_init=5)
+    algorithm_kmeans.append((kmeans_i, "KMeans_" + str(i)))
+
+# AgglomerativeClustering with different numbers of clusters
+
+algorithm_AC = []
+
+for i in range(5,9):
+    agglo_i = AgglomerativeClustering(n_clusters=i,linkage="ward")
+    algorithm_AC.append((agglo_i, "AC_" + str(i)))
+
+# MiniBatchKMeans with different numbers of clusters
+
+algorithm_miniBatch = []
+
+for i in range(5,9):
+    miniBatch_i = MiniBatchKMeans(init='k-means++',n_clusters=i, n_init=5, max_no_improvement=10, verbose=0, random_state=random_seed)
+    algorithm_miniBatch.append((miniBatch_i, "MiniBatchKM_" + str(i)))
+
+#-----------------------------------------------------#
+
+# RUN CASE 1
+executeClustering(algorithms, X, 1)
+executeClustering(algorithm_kmeans, X, 1.1)
+executeClustering(algorithm_AC, X, 1.2)
+
+# RUN CASE 2
+executeClustering(algorithms, X_2, 2)
+executeClustering(algorithm_kmeans, X_2, 2.1)
+executeClustering(algorithm_miniBatch, X_2, 2.2)
+
+# RUN CASE 3
+executeClustering(algorithms2, X_3, 3)
+executeClustering(algorithm_kmeans, X_3, 3.1)
+executeClustering(algorithm_miniBatch, X_3, 3.2)
+

+ 132 - 0
data/purposeCombined/BI/apriori.py

@@ -0,0 +1,132 @@
+# author: Justin Cui
+# date: 2019/10/23
+# email: 321923502@qq.com
+
+
+from numpy import *
+
+
+def load_data():
+    dataSet = [['bread', 'milk', 'vegetable', 'fruit', 'eggs'],
+               ['noodle', 'beef', 'pork', 'water', 'socks', 'gloves', 'shoes', 'rice'],
+               ['socks', 'gloves'],
+               ['bread', 'milk', 'shoes', 'socks', 'eggs'],
+               ['socks', 'shoes', 'sweater', 'cap', 'milk', 'vegetable', 'gloves'],
+               ['eggs', 'bread', 'milk', 'fish', 'crab', 'shrimp', 'rice']]
+    return dataSet
+
+
+# Scan the full data set and generate candidate set C1
+def create_c1(data):
+    c1 = []
+    for transaction in data:
+        for item in transaction:
+            if [item] not in c1:
+                c1.append([item])
+    c1.sort()
+    return list(map(frozenset, c1))
+
+
+# Generate the frequent set L(i) from the corresponding C(i)
+def c2l(data, ck, min_support):
+    dict_sup = {}
+    for i in data:
+        for j in ck:
+            if j.issubset(i):
+                if j not in dict_sup:
+                    dict_sup[j] = 1
+                else:
+                    dict_sup[j] += 1
+    support_data = {}
+    result_list = []
+    for i in dict_sup:
+        temp_sup = dict_sup[i] / len(data)
+        if temp_sup >= min_support:
+            result_list.append(i)
+            support_data[i] = temp_sup
+    return result_list, support_data
+
+
+# Generate the candidate set C(k) from L(k-1)
+def get_next_c(Lk, k):
+    result_list = []
+    len_lk = len(Lk)
+    for i in range(len_lk):
+        for j in range(i + 1, len_lk):
+            l1 = list(Lk[i])[:k]
+            l2 = list(Lk[j])[:k]
+            if l1 == l2:
+                a = Lk[i] | Lk[j]
+                a1 = list(a)
+                b = []
+                for q in range(len(a1)):
+                    t = [a1[q]]
+                    tt = frozenset(set(a1) - set(t))
+                    b.append(tt)
+                t = 0
+                for w in b:
+                    if w in Lk:
+                        t += 1
+                if t == len(b):
+                    result_list.append(b[0] | b[1])
+    return result_list
+
+
+# Get all the frequent item sets L
+def get_all_l(data_set, min_support):
+    c1 = create_c1(data_set)
+    data = list(map(set, data_set))
+    L1, support_data = c2l(data, c1, min_support)
+    L = [L1]
+    k = 2
+    while (len(L[k - 2]) > 0):
+        Ck = get_next_c(L[k - 2], k - 2)
+        Lk, sup = c2l(data, Ck, min_support)
+        support_data.update(sup)
+        L.append(Lk)
+        k += 1
+    del L[-1]
+    return L, support_data
+
+
+# Get all subsets of the L sets
+def get_subset(from_list, result_list):
+    for i in range(len(from_list)):
+        t = [from_list[i]]
+        tt = frozenset(set(from_list) - set(t))
+        if tt not in result_list:
+            result_list.append(tt)
+            tt = list(tt)
+            if len(tt) > 1:
+                get_subset(tt, result_list)
+
+
+# Compute the confidence (and lift) of each candidate rule
+def calc_conf(freqSet, H, supportData, min_conf):
+    for conseq in H:
+        conf = supportData[freqSet] / supportData[freqSet - conseq]
+        lift = supportData[freqSet] / (supportData[conseq] * supportData[freqSet - conseq])
+        if conf >= min_conf and lift > 1:
+            print(set(freqSet - conseq), '-->', set(conseq), 'support', round(supportData[freqSet - conseq], 2), 'confidence:',
+                  conf)
+
+
+# Generate the association rules
+def gen_rule(L, support_data, min_conf=0.7):
+    for i in range(len(L)):
+        print("\n", i + 1, "-频繁项集为:")
+        for freqSet in L[i]:
+            print(set(freqSet), end="  ")
+    print("\n")
+    for i in range(1, len(L)):
+        for freqSet in L[i]:
+            H1 = list(freqSet)
+            all_subset = []
+            get_subset(H1, all_subset)
+            calc_conf(freqSet, all_subset, support_data, min_conf)
+
+
+if __name__ == '__main__':
+    dataSet = load_data()
+    L, supportData = get_all_l(dataSet, 0.5)
+    gen_rule(L, supportData, 0.6)
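+
+# Added worked example, using the six baskets returned by load_data():
+#   support({milk}) = 4/6, support({bread}) = 3/6, support({milk, bread}) = 3/6
+#   confidence({milk} -> {bread}) = support({milk, bread}) / support({milk}) = (3/6) / (4/6) = 0.75
+#   lift({milk} -> {bread})       = support({milk, bread}) / (support({milk}) * support({bread}))
+#                                 = (3/6) / ((4/6) * (3/6)) = 1.5   (> 1, so calc_conf would keep the rule)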

+ 440 - 0
data/purposeCombined/BI/bi_main.py

@@ -0,0 +1,440 @@
+"""
+This OOP script is for the BI Challenge
+"""
+from warnings import simplefilter
+simplefilter(action='ignore', category=FutureWarning)
+# %matplotlib inline
+from google.colab import files
+import pandas as pd
+import numpy as np
+# %reload_ext sql
+import sqlite3
+import seaborn as sns
+import matplotlib.pyplot as plt
+from plotly.offline import iplot
+import plotly.express as px
+from IPython.display import display  # display() is used below; imported explicitly so the script also runs outside a notebook
+
+pd.options.display.float_format = '{:.2f}'.format # suppress scientific notation
+# Declare your Github Repository address
+A_url='https://raw.githubusercontent.com/haensel-ams/recruitment_challenge/master/BI_201805/table_A_conversions.csv'
+B_url='https://raw.githubusercontent.com/haensel-ams/recruitment_challenge/master/BI_201805/table_B_attribution.csv'
+
+# The Extract class extracts data from your Github repository address
+class Extract():
+
+  def __init__(self,A_url,B_url):
+    print('\033[1m'+'Please, wait! I am extracting data from your Github Repository'+'\033[0m'+'\n...')
+    self.A_url=A_url
+    self.table_A_conversions=self.load_data(self.A_url)
+    self.B_url=B_url
+    self.table_B_attribution=self.load_data(self.B_url)
+    print('Data was successfully extracted!')
+  
+  def load_data(self,url):
+    self.data=pd.read_csv(url)
+    #display(self.data.head(3))
+    return self.data
+
+# The Transform class combines the two extracted datasets and does the data cleansing.
+# It also reports general information about the KPIs.
+class Transform():
+
+  def __init__(self,extract):
+    print('\033[1m'+'I am transforming the extracted data'+'\033[0m'+'\n...')
+    self.table_A_conversions=extract.table_A_conversions
+    self.table_B_attribution=extract.table_B_attribution
+    self.joined_tabs = self.combine_tab(self.table_A_conversions, self.table_B_attribution,'Conv_ID')
+    self.time_tab=self.cleaning_data(self.joined_tabs)
+    # self.infor_Data=self.get_infor(self.time_tab)
+    self.get_missing=self.check_missing(self.time_tab)
+    self.cleaned_tab=self.time_tab.dropna()
+    display(self.cleaned_tab.head(5))
+    self.infor_Data=self.get_infor(self.cleaned_tab)
+    self.more_infor=self.deep_infor(self.cleaned_tab)
+  
+  def deep_infor(self,data):
+    print('Total annual revenue: %d'%data['Revenue'].sum())
+    
+
+  def combine_tab(self,tab_1,tab_2,common_col):
+    print('I am combining the two datasets into one and converting the time format\n...')
+    self.data=pd.merge(tab_1, tab_2, on=common_col, how='outer')
+    # display(self.data.head(5))
+    return self.data
+
+  def cleaning_data(self,data):
+    data['Conv_Date']= pd.to_datetime(data['Conv_Date']) 
+    self.data=data
+    print('Data was completely transformed!')
+    return self.data
+
+  def get_infor(self,data):
+    print('\033[1m'+'General information:'+'\033[0m')
+    self.information=data.info()
+    print('\033[1m'+'Descriptive Statistics:'+'\033[0m')
+    # print(data.describe())
+    return self.information
+
+  def check_missing(self,data):
+    print('\033[1m'+ 'The number of missing values:'+'\033[0m')
+    self.miss_data=data.isnull().sum()
+    self.miss_rate=100*data.isnull().sum()/len(data)
+    self.mis_infor=pd.concat([self.miss_data, self.miss_rate], axis=1).reset_index()
+    self.mis_infor=self.mis_infor.rename(columns={0: 'Amounts', 1: 'Percentage'})
+    # print(self.mis_infor)
+    return self.miss_data
+
+# The Load class loads the transformed data into the database
+class  Load():
+
+  def __init__(self,transform):
+    print('\033[1m'+'I am loading the transformed data to my database'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab
+    self.connect=self.connect_database()
+    self.insert=self.insert_data(self.data)
+    
+  def connect_database(self):
+    print('I am trying to connect to my SQL database\n....')
+    self.connect= "%sql sqlite:///phuong_database.db"
+    print(self.connect,'connection is success!',sep='\n')
+    return self.connect
+
+  def insert_data(self,data):
+    print('I am loading the transformed data to my SQL Database\n....')
+    self.check ="%sql DROP TABLE IF EXISTS data"
+    self.insert="%sql PERSIST data"
+    self.list_table="%sql SELECT name FROM sqlite_master WHERE type='table'"
+    print(self.list_table)
+    self.data="%sql SELECT * FROM data LIMIT 3"
+    print(self.data)
+    print('Data was completely inserted into my SQL Database!')
+    return self.insert 
+
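+# Added sketch (the wrapper function is an assumption, not part of the original script):
+# how the Extract / Transform / Load classes above chain together.
+def run_etl_pipeline():
+    extracted = Extract(A_url, B_url)      # pull both CSVs from the Github repository
+    transformed = Transform(extracted)     # merge, clean and profile the joined table
+    Load(transformed)                      # push the cleaned table into the SQL database
+    return transformed                     # the cleaned data then feeds the EDA_* classes below
+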
+# The EDA_Overview_KPI class generates a preliminary overview of the daily KPIs
+class EDA_Overview_KPI():
+
+  def __init__(self,transform):
+    print('\033[1m'+'I am doing the Explanatory Data Analysis (EDA) process for Revenue KPIs'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab[['Conv_Date','Revenue','User_ID']]
+    self.by_kpi=self.group_data(self.data,'Conv_Date','Revenue','User_ID')
+    # display(self.by_kpi.head(3))
+    self.kpi_fig=self.plot_kpi(self.by_kpi)
+    self.sum_stat=self.get_infor(self.by_kpi,'Conv_Date','Revenue','User_ID')
+      
+    
+  def group_data(self,data,target,exp_1,exp_2):
+    self.num_target=len(data[target].unique())
+    print('The number of '+target+': %d'%self.num_target)
+    self.data=data.groupby([target]).agg({exp_1:'sum',exp_2:'count'})
+    return self.data
+
+  def plot_kpi(self,data):
+    self.name_column=self.data.columns
+    plt.figure(figsize=(15, 9))
+    for i,col in enumerate(self.name_column):
+        plt.subplot(2,1,i+1)
+        plt.plot(self.data[col],label=col)
+        plt.title('The changes in the daily '+col +' over the time period',fontweight='bold',fontsize='12')
+        plt.legend()
+        plt.autoscale(enable=True, axis='both',tight=True)
+    plt.savefig('Overview_KPI.png')
+    files.download('Overview_KPI.png')
+    return self.name_column
+
+  def get_infor(self,data,target,exp_1,exp_2):
+    self.infor=display(self.data.head(8).T)
+    print('\033[1m'+'Descriptive Statistics of the Daily KPIs by '+ target +'\033[0m', self.data.describe(),sep='\n')
+    print('Date with the highest revenue:', self.data[exp_1].idxmax(axis = 0) )
+    print('Date with the lowest revenue:', self.data[exp_1].idxmin(axis = 0) )
+    print('Date with the highest number of users:', self.data[exp_2].idxmax(axis = 0) )
+    print('Date with the lowest number of users:', self.data[exp_2].idxmin(axis = 0) )
+    return self.infor
+
+# The EDA_KPI_Return class generates a preliminary overview of returning customers
+class EDA_KPI_Return():
+
+  def __init__(self,transform):
+    print('\033[1m'+'I am doing the Explanatory Data Analysis (EDA) process for User KPIs'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab[['Conv_Date','User_ID']]
+    self.infor_user=self.get_infor(self.data,'User_ID')
+    self.by_user=self.group_data(self.data,'User_ID','Conv_Date')
+    display(self.by_user.head(8).T)
+    self.user_plot=self.plot_user(self.by_user,'Conv_Date')
+
+  def get_infor(self,data,exp):
+    self.num_user=data[exp].unique()
+    print('The number of users: %d'%len(self.num_user))
+    return self.num_user
+
+  def group_data(self,data,target,exp):
+    self.num_target=len(data[target].unique())
+    print('The number of '+target+': %d'%self.num_target)
+    self.data=data.groupby([target]).agg({exp:'count'})
+    # display(self.data.head(8).T)
+    print('\033[1m'+'Descriptive Statistics of the Daily KPIs by '+ target +'\033[0m', self.data.describe(),sep='\n')
+    return self.data
+
+  def plot_user(self,data,exp):
+    self.data=data.rename(columns={exp: 'The number of returns'})
+    self.ax=self.data.plot.hist(figsize=(15, 9),bins=1500,xlim=(1,20),color='#86bf91'
+                                ,title='The frequency of returning customers',grid=True)
+    self.ax.set_xlabel('The number of days')
+    plt.savefig('Customer_return.png')
+    files.download('Customer_return.png') 
+    return self.ax
+
+# The EDA_Static_Ren class explores the total revenue per channel over the whole period
+class EDA_Static_Ren():
+  
+  def __init__(self,transform):
+    print('\033[1m'+'I am doing the EDA on Revenue by Channel'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab[['Channel','Revenue']]
+    display(self.data.head(3))
+    # self.infor_conver=self.get_infor(self.data)
+    self.by_ChanelRen=self.group_data(self.data,'Channel')
+    self.pie_ChanelRen=self.plot_pie(self.by_ChanelRen,'Revenue')
+
+  def plot_pie(self,data,target):
+    self.data=data
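+    # The five named channels ('A','G','H','I','B') are treated as the top
+    # contributors and everything else is lumped into 'The Rest' so the pie
+    # chart stays readable; the channel labels are assumed from the dataset.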
+    self.data['Total Conver'] = self.data.sum(axis=1)
+    self.data['Total Top Five'] = self.data[['A','G','H','I','B']].sum(axis=1)
+    self.data['The Rest'] = self.data['Total Conver']-self.data['Total Top Five']
+    self.ax=self.data[['A','G','H','I','B','The Rest']].T.plot.pie(y=target,figsize=(12, 7),autopct='%1.1f%%',)
+    plt.savefig('channel_Static_Ren.jpg')
+    files.download('channel_Static_Ren.jpg') 
+    return self.data
+    
+
+  def get_infor(self,data):
+    self.conver_uni=self.data.User_ID.unique()
+    print('The number of conversions: %d'%len(self.conver_uni))
+    return self.conver_uni
+
+  def group_data(self,data,target):
+    print('I am grouping data by '+ target + '\n...')
+    self.data=data.groupby([target]).agg({'Revenue':'sum'})
+    self.data=self.data.T
+    display(self.data)
+    print('I am done! ')
+    return self.data    
+
+# The EDA_Static_User class generates the total number of visits per channel over the whole period
+class EDA_Static_User():
+  
+  def __init__(self,transform):
+    print('\033[1m'+'I am doing the EDA on Users by Channel'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab[['Channel','User_ID']] #'Conv_Date',
+    display(self.data.head(3))
+    # self.infor_conver=self.get_infor(self.data)
+    self.by_ChanelConv=self.group_data(self.data,'Channel')
+    self.pie_channelConv=self.plot_pie(self.by_ChanelConv,'User_ID')
+
+  def plot_pie(self,data,target):
+    self.data=data
+    self.data['Total Conver'] = self.data.sum(axis=1)
+    self.data['Total Top Five'] = self.data[['A','G','H','I','B']].sum(axis=1)
+    self.data['The Rest'] = self.data['Total Conver']-self.data['Total Top Five']
+    self.ax=self.data[['A','G','H','I','B','The Rest']].T.plot.pie(y=target,figsize=(12, 7),autopct='%1.1f%%',)
+    plt.savefig('channel_Static_User.jpg')
+    files.download('channel_Static_User.jpg') 
+    return self.data
+    
+
+  def get_infor(self,data):
+    self.conver_uni=self.data.User_ID.unique()
+    print('The number of conversions: %d'%len(self.conver_uni))
+    return self.conver_uni
+
+  def group_data(self,data,target):
+    print('I am grouping data by '+ target + '\n...')
+    self.data=data.groupby([target]).agg({'User_ID':'count'})
+    self.data=self.data.T
+    display(self.data)
+    print('I am done! ')
+    return self.data    
+
+# The EDA_Static_Conversion class generates the total number of conversions per channel over the whole period
+class EDA_Static_Conversion():
+  
+  def __init__(self,transform):
+    print('\033[1m'+'I am doing the EDA on Conversion'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab[['Channel','Conv_ID','IHC_Conv']] #'Conv_Date',
+    display(self.data.head(3))
+    self.infor_conver=self.get_infor(self.data)
+    self.by_ChanelConv=self.group_data(self.data,'Channel','Conv_ID')
+    self.pie_channelConv=self.plot_pie(self.by_ChanelConv,'Conv_ID')
+
+
+  def get_infor(self,data):
+    self.conver_uni=self.data.Conv_ID.unique()
+    print('The number of conversions: %d'%len(self.conver_uni))
+    return self.conver_uni
+
+  def group_data(self,data,target,exp):
+    print('I am grouping data by '+ target + '\n...')
+    if data[exp].dtype=='object':
+      self.data=data.groupby([target]).agg({exp:'count'})
+    else:
+      self.data=data.groupby([target]).agg({exp:'sum'})
+    self.data=self.data.T
+    display(self.data)
+    print('I am done! ')
+    return self.data    
+
+  def plot_pie(self,data,target):
+    self.data=data
+    self.data['Total Conver'] = self.data.sum(axis=1)
+    self.data['Total Top Five'] = self.data[['A','G','H','I','B']].sum(axis=1)
+    self.data['The Rest'] = self.data['Total Conver']-self.data['Total Top Five']
+    self.ax=self.data[['A','G','H','I','B','The Rest']].T.plot.pie(y=target,figsize=(12, 7),autopct='%1.1f%%',)
+    plt.savefig('channel_Conver.png')
+    files.download('channel_Conver.png') 
+    return self.data
+
+# The EDA_Channel_Revenue class is to analyze the impacts of the online marketing channels on 
+# the daily Revenue
+class EDA_Channel_Revenue():
+
+  def __init__(self,transform):
+    print('\033[1m'+'I am analyzing the influences of the online marketing channels on the daily revenue'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab[['Conv_Date','Channel','Revenue']]
+    self.by_DateChannel=self.group_data(self.data,'Conv_Date','Channel')
+    self.unstaked_data=self.unstack_data(self.by_DateChannel,'Revenue','bar')
+    self.plotted_data=self.plot_data(self.unstaked_data)   
+    self.exported_data=self.export_data(self.unstaked_data,'channel_revenue')
+
+  def group_data(self,data,target_1,target_2):
+    print('I am grouping data by '+ target_1 +' and '+ target_2 + '\n...')
+    self.data=data.groupby([target_1,target_2])#.agg({exp:'count'})
+    print('I am done! ')
+    return self.data    
+
+  def unstack_data(self,data,exp,kind):
+    print('I am unstacking data \n...')
+    data=data.sum()[exp].unstack(level=-1)
+    self.data=data
+    display(self.data.head(3))
+    print('Data were unstacked completely\n...')
+    return self.data
+
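+  # plot_data expects the unstacked frame (Conv_Date index, one column of
+  # summed daily Revenue per Channel) and draws the top five channels plus
+  # 'The Rest' as a stacked area chart over the observed date range.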
+  def plot_data(self,data):
+    self.data=data
+    print('I am visualizing the contribution of Top 5 Channels to the Daily Revenue\n...')
+    self.data['The Total'] = self.data.sum(axis=1)
+    self.data['The Rest']= self.data['The Total']-self.data[['A','G','H','I','B']].sum(axis=1)
+    self.xlim=('2017-03-01','2018-03-24')
+    self.ax =self.data[['A','G','H','I','B','The Rest']].plot.area(xlim=self.xlim, figsize=(12,8))
+    self.ax.set_xlabel('Date')
+    self.ax.set_ylabel('Revenue')
+    print(self.data['The Rest'].describe())
+    plt.savefig('channel_ren.png')
+    files.download('channel_ren.png') 
+    return self.data
+    
+  def export_data(self,data,title):
+    print('I am exporting data to the excel and csv files\n...')
+    data.to_excel(title+'.xlsx')
+    self.excel=files.download(title+'.xlsx')
+    data.to_csv(title+'.csv')
+    self.csv=files.download(title+'.csv')
+    return self.excel
+
+# The EDA_Channel_User class is to analyze the impacts of the online marketing channels on 
+# the daily number of users
+class EDA_Channel_User():
+
+  def __init__(self,transform):
+    print('\033[1m'+'I am analyzing the influences of the online marketing channels on the daily number of users'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab[['Conv_Date','Channel','User_ID']]
+    self.by_DateUser=self.group_data(self.data,'Conv_Date','Channel','User_ID')
+    self.unstaked_data=self.unstack_data(self.by_DateUser,'User_ID','bar')
+    #display(self.unstaked_data.head(3))
+    self.plotted_data=self.plot_data(self.unstaked_data)   
+    # self.exported_data=self.export_data(self.unstaked_data,'channel_num_user')
+
+  def group_data(self,data,target_1,target_2,exp):
+    print('I am grouping data by '+ target_1 +' and '+ target_2 + '\n...')
+    self.data=data.groupby([target_1,target_2])#.agg({exp:'count'})
+    print('I am done! ')
+    return self.data    
+
+  def unstack_data(self,data,exp,kind):
+    print('I am unstacking data \n...')
+    data=data.count()[exp].unstack(level=-1)
+    self.data=data
+    print('Data were unstacked completely\n...')
+    return self.data
+
+  def plot_data(self,data):
+    self.data=data
+    print('I am visualizing the contribution of Top 5 Channels to the Daily Number of Users\n...')
+    self.data['The Total'] = self.data.sum(axis=1)
+    self.data['The Rest'] = self.data['The Total'] - self.data[['A','G','H','I','B']].sum(axis=1)
+    self.xlim=('2017-03-01','2018-03-24')
+    self.ax =self.data[['A','G','H','I','B','The Rest']].plot.area(xlim=self.xlim, figsize=(12,8))
+    self.ax.set_xlabel('Date')
+    self.ax.set_ylabel('The number of Users')
+    plt.savefig('channel_user.png')
+    files.download('channel_user.png') 
+    return self.data
+    
+  def export_data(self,data,title):
+    print('I am exporting data to the excel and csv files\n...')
+    data.to_excel(title+'.xlsx')
+    self.excel=files.download(title+'.xlsx')
+    data.to_csv(title+'.csv')
+    self.csv=files.download(title+'.csv')
+    return self.excel
+
+# The EDA_channel_IHC class plots the changes in the daily IHC of the channels
+class EDA_channel_IHC():
+
+  def __init__(self,transform):
+    print('\033[1m'+'I am doing the EDA on the daily IHC of the channels'+'\033[0m'+'\n...')
+    self.data=transform.cleaned_tab[['Conv_Date','Channel','IHC_Conv']] #'Conv_Date',
+    self.by_TimeChannel=self.group_data(self.data,'Conv_Date','Channel','IHC_Conv')
+    self.unstacked_data=self.unstack_data(self.by_TimeChannel,'IHC_Conv')
+    self.change_plot=self.plot_data(self.unstacked_data)
+
+  def plot_data(self,data):
+    self.data=data
+    # self.data['The Rest'] = self.data.sum(axis=1)
+    self.xlim=('2017-03-01','2018-03-24')
+    self.ylim=('0','550')
+    self.ax =self.data[['A','G','H','I','B']].plot.line(xlim=self.xlim,figsize=(12,8))
+    self.ax.set_xlabel('Date')
+    self.ax.set_ylabel('IHC_Conv')
+    plt.savefig('channel_IHC.png')
+    files.download('channel_IHC.png') 
+    return self.data
+
+  def group_data(self,data,target_1,target_2,exp):
+    print('I am grouping data by '+ target_1 +' and '+ target_2 + '\n...')
+    self.data=data.groupby([target_1,target_2])#.agg({exp:'sum'})
+    print('I am done! ')
+    return self.data    
+
+  def unstack_data(self,data,exp):
+    print('I am unstacking data \n...')
+    data=data.sum()[exp].unstack(level=-1)
+    self.data=data
+    print('Data were unstacked completely\n...')
+    return self.data
+
+
+def main():
+  # Run the whole pipeline: Extract -> Transform -> Load, then the EDA steps.
+  # (The original declared this as a class whose body ran at import time, so
+  # the main() call below did nothing; a plain function defers the work to
+  # the __main__ guard.)
+  extract=Extract(A_url,B_url)
+  transform=Transform(extract)
+  load=Load(transform)
+  eda_overview_kpi=EDA_Overview_KPI(transform)
+  eda_static_ren=EDA_Static_Ren(transform)
+  eda_kpi_return=EDA_KPI_Return(transform)
+  eda_static_user=EDA_Static_User(transform)
+  eda_static_conversion=EDA_Static_Conversion(transform)
+  eda_channel_revenue=EDA_Channel_Revenue(transform)
+  eda_channel_user=EDA_Channel_User(transform)
+  eda_channel_ihc=EDA_channel_IHC(transform)
+
+
+if __name__=='__main__':
+  main()
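+
+# Note: the display() and files.download() calls above assume an IPython /
+# Google Colab runtime (google.colab.files), and pandas, matplotlib and the
+# A_url/B_url source locations are expected to be defined earlier in this script.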

+ 727 - 0
data/purposeCombined/BI/cube-backup.py

@@ -0,0 +1,727 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=line-too-long,unused-argument,ungrouped-imports
+"""A collection of ORM sqlalchemy models for Superset"""
+import json
+import logging
+import textwrap
+from contextlib import closing
+from copy import deepcopy
+from datetime import datetime
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
+
+import numpy
+import pandas as pd
+import sqlalchemy as sqla
+import sqlparse
+from flask import g, request
+from flask_appbuilder import Model
+from sqlalchemy import (
+    Boolean,
+    Column,
+    create_engine,
+    DateTime,
+    ForeignKey,
+    Integer,
+    MetaData,
+    String,
+    Table,
+    Text,
+)
+from sqlalchemy.engine import Dialect, Engine, url
+from sqlalchemy.engine.reflection import Inspector
+from sqlalchemy.engine.url import make_url, URL
+from sqlalchemy.ext.hybrid import hybrid_property
+from sqlalchemy.orm import relationship
+from sqlalchemy.pool import NullPool
+from sqlalchemy.schema import UniqueConstraint
+from sqlalchemy.sql import expression, Select
+from sqlalchemy_utils import EncryptedType
+
+from superset import app, db_engine_specs, is_feature_enabled, security_manager
+from superset.db_engine_specs.base import TimeGrain
+from superset.models.dashboard import Dashboard
+from superset.models.helpers import AuditMixinNullable, ImportMixin
+from superset.models.tags import DashboardUpdater, FavStarUpdater
+from superset.utils import cache as cache_util, core as utils
+
+config = app.config
+custom_password_store = config["SQLALCHEMY_CUSTOM_PASSWORD_STORE"]
+stats_logger = config["STATS_LOGGER"]
+log_query = config["QUERY_LOGGER"]
+metadata = Model.metadata  # pylint: disable=no-member
+logger = logging.getLogger(__name__)
+
+PASSWORD_MASK = "X" * 10
+DB_CONNECTION_MUTATOR = config["DB_CONNECTION_MUTATOR"]
+
+
+class Url(Model, AuditMixinNullable):
+    """Used for the short url feature"""
+
+    __tablename__ = "url"
+    id = Column(Integer, primary_key=True)
+    url = Column(Text)
+
+
+class KeyValue(Model):  # pylint: disable=too-few-public-methods
+
+    """Used for any type of key-value store"""
+
+    __tablename__ = "keyvalue"
+    id = Column(Integer, primary_key=True)
+    value = Column(Text, nullable=False)
+
+
+class CssTemplate(Model, AuditMixinNullable):
+
+    """CSS templates for dashboards"""
+
+    __tablename__ = "css_templates"
+    id = Column(Integer, primary_key=True)
+    template_name = Column(String(250))
+    css = Column(Text, default="")
+
+
+class Database(
+    Model, AuditMixinNullable, ImportMixin
+):  # pylint: disable=too-many-public-methods
+
+    """An ORM object that stores Database related information"""
+
+    __tablename__ = "dbs"
+    type = "table"
+    __table_args__ = (UniqueConstraint("database_name"),)
+
+    id = Column(Integer, primary_key=True)
+    verbose_name = Column(String(250), unique=True)
+    # short unique name, used in permissions
+    database_name = Column(String(250), unique=True, nullable=False)
+    sqlalchemy_uri = Column(String(1024), nullable=False)
+    password = Column(EncryptedType(String(1024), config["SECRET_KEY"]))
+    cache_timeout = Column(Integer)
+    select_as_create_table_as = Column(Boolean, default=False)
+    expose_in_sqllab = Column(Boolean, default=True)
+    allow_run_async = Column(Boolean, default=False)
+    allow_csv_upload = Column(Boolean, default=False)
+    allow_ctas = Column(Boolean, default=False)
+    allow_cvas = Column(Boolean, default=False)
+    allow_dml = Column(Boolean, default=False)
+    force_ctas_schema = Column(String(250))
+    allow_multi_schema_metadata_fetch = Column(  # pylint: disable=invalid-name
+        Boolean, default=False
+    )
+    extra = Column(
+        Text,
+        default=textwrap.dedent(
+            """\
+    {
+        "metadata_params": {},
+        "engine_params": {},
+        "metadata_cache_timeout": {},
+        "schemas_allowed_for_csv_upload": []
+    }
+    """
+        ),
+    )
+    encrypted_extra = Column(EncryptedType(Text, config["SECRET_KEY"]), nullable=True)
+    impersonate_user = Column(Boolean, default=False)
+    server_cert = Column(EncryptedType(Text, config["SECRET_KEY"]), nullable=True)
+    export_fields = [
+        "database_name",
+        "sqlalchemy_uri",
+        "cache_timeout",
+        "expose_in_sqllab",
+        "allow_run_async",
+        "allow_ctas",
+        "allow_cvas",
+        "allow_csv_upload",
+        "extra",
+    ]
+    export_children = ["tables"]
+
+    def __repr__(self) -> str:
+        return self.name
+
+    @property
+    def name(self) -> str:
+        return self.verbose_name if self.verbose_name else self.database_name
+
+    @property
+    def allows_subquery(self) -> bool:
+        return self.db_engine_spec.allows_subqueries
+
+    @property
+    def function_names(self) -> List[str]:
+        try:
+            return self.db_engine_spec.get_function_names(self)
+        except Exception as ex:  # pylint: disable=broad-except
+            # function_names property is used in bulk APIs and should not hard crash
+            # more info in: https://github.com/apache/incubator-superset/issues/9678
+            logger.error(
+                "Failed to fetch database function names with error: %s", str(ex)
+            )
+        return []
+
+    @property
+    def allows_cost_estimate(self) -> bool:
+        extra = self.get_extra()
+
+        database_version = extra.get("version")
+        cost_estimate_enabled: bool = extra.get("cost_estimate_enabled")  # type: ignore
+
+        return (
+            self.db_engine_spec.get_allow_cost_estimate(database_version)
+            and cost_estimate_enabled
+        )
+
+    @property
+    def allows_virtual_table_explore(self) -> bool:
+        extra = self.get_extra()
+
+        return bool(extra.get("allows_virtual_table_explore", True))
+
+    @property
+    def explore_database_id(self) -> int:
+        return self.get_extra().get("explore_database_id", self.id)
+
+    @property
+    def data(self) -> Dict[str, Any]:
+        return {
+            "id": self.id,
+            "name": self.database_name,
+            "backend": self.backend,
+            "allow_multi_schema_metadata_fetch": self.allow_multi_schema_metadata_fetch,
+            "allows_subquery": self.allows_subquery,
+            "allows_cost_estimate": self.allows_cost_estimate,
+            "allows_virtual_table_explore": self.allows_virtual_table_explore,
+            "explore_database_id": self.explore_database_id,
+        }
+
+    @property
+    def unique_name(self) -> str:
+        return self.database_name
+
+    @property
+    def url_object(self) -> URL:
+        return make_url(self.sqlalchemy_uri_decrypted)
+
+    @property
+    def backend(self) -> str:
+        sqlalchemy_url = make_url(self.sqlalchemy_uri_decrypted)
+        return sqlalchemy_url.get_backend_name()  # pylint: disable=no-member
+
+    @property
+    def metadata_cache_timeout(self) -> Dict[str, Any]:
+        return self.get_extra().get("metadata_cache_timeout", {})
+
+    @property
+    def schema_cache_enabled(self) -> bool:
+        return "schema_cache_timeout" in self.metadata_cache_timeout
+
+    @property
+    def schema_cache_timeout(self) -> Optional[int]:
+        return self.metadata_cache_timeout.get("schema_cache_timeout")
+
+    @property
+    def table_cache_enabled(self) -> bool:
+        return "table_cache_timeout" in self.metadata_cache_timeout
+
+    @property
+    def table_cache_timeout(self) -> Optional[int]:
+        return self.metadata_cache_timeout.get("table_cache_timeout")
+
+    @property
+    def default_schemas(self) -> List[str]:
+        return self.get_extra().get("default_schemas", [])
+
+    @property
+    def connect_args(self) -> Dict[str, Any]:
+        return self.get_extra().get("engine_params", {}).get("connect_args", {})
+
+    @classmethod
+    def get_password_masked_url_from_uri(  # pylint: disable=invalid-name
+        cls, uri: str
+    ) -> URL:
+        sqlalchemy_url = make_url(uri)
+        return cls.get_password_masked_url(sqlalchemy_url)
+
+    @classmethod
+    def get_password_masked_url(
+        cls, url: URL  # pylint: disable=redefined-outer-name
+    ) -> URL:
+        url_copy = deepcopy(url)
+        if url_copy.password is not None:
+            url_copy.password = PASSWORD_MASK
+        return url_copy
+
+    def set_sqlalchemy_uri(self, uri: str) -> None:
+        conn = sqla.engine.url.make_url(uri.strip())
+        if conn.password != PASSWORD_MASK and not custom_password_store:
+            # do not over-write the password with the password mask
+            self.password = conn.password
+        conn.password = PASSWORD_MASK if conn.password else None
+        self.sqlalchemy_uri = str(conn)  # hides the password
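+        # e.g. "postgresql://scott:tiger@host/db" is persisted as
+        # "postgresql://scott:XXXXXXXXXX@host/db"; the real password lives in
+        # the encrypted `password` column instead of the URI.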
+
+    def get_effective_user(
+        self,
+        url: URL,  # pylint: disable=redefined-outer-name
+        user_name: Optional[str] = None,
+    ) -> Optional[str]:
+        """
+        Get the effective user, especially during impersonation.
+        :param url: SQL Alchemy URL object
+        :param user_name: Default username
+        :return: The effective username
+        """
+        effective_username = None
+        if self.impersonate_user:
+            effective_username = url.username
+            if user_name:
+                effective_username = user_name
+            elif (
+                hasattr(g, "user")
+                and hasattr(g.user, "username")
+                and g.user.username is not None
+            ):
+                effective_username = g.user.username
+        return effective_username
+
+    @utils.memoized(watch=("impersonate_user", "sqlalchemy_uri_decrypted", "extra"))
+    def get_sqla_engine(
+        self,
+        schema: Optional[str] = None,
+        nullpool: bool = True,
+        user_name: Optional[str] = None,
+        source: Optional[utils.QuerySource] = None,
+    ) -> Engine:
+        extra = self.get_extra()
+        sqlalchemy_url = make_url(self.sqlalchemy_uri_decrypted)
+        self.db_engine_spec.adjust_database_uri(sqlalchemy_url, schema)
+        effective_username = self.get_effective_user(sqlalchemy_url, user_name)
+        # If using MySQL or Presto for example, will set url.username
+        # If using Hive, will not do anything yet since that relies on a
+        # configuration parameter instead.
+        self.db_engine_spec.modify_url_for_impersonation(
+            sqlalchemy_url, self.impersonate_user, effective_username
+        )
+
+        masked_url = self.get_password_masked_url(sqlalchemy_url)
+        logger.debug("Database.get_sqla_engine(). Masked URL: %s", str(masked_url))
+
+        params = extra.get("engine_params", {})
+        if nullpool:
+            params["poolclass"] = NullPool
+
+        connect_args = params.get("connect_args", {})
+        configuration = connect_args.get("configuration", {})
+
+        # If using Hive, this will set hive.server2.proxy.user=$effective_username
+        configuration.update(
+            self.db_engine_spec.get_configuration_for_impersonation(
+                str(sqlalchemy_url), self.impersonate_user, effective_username
+            )
+        )
+        if configuration:
+            connect_args["configuration"] = configuration
+        if connect_args:
+            params["connect_args"] = connect_args
+
+        params.update(self.get_encrypted_extra())
+
+        if DB_CONNECTION_MUTATOR:
+            if not source and request and request.referrer:
+                if "/superset/dashboard/" in request.referrer:
+                    source = utils.QuerySource.DASHBOARD
+                elif "/superset/explore/" in request.referrer:
+                    source = utils.QuerySource.CHART
+                elif "/superset/sqllab/" in request.referrer:
+                    source = utils.QuerySource.SQL_LAB
+
+            sqlalchemy_url, params = DB_CONNECTION_MUTATOR(
+                sqlalchemy_url, params, effective_username, security_manager, source
+            )
+
+        return create_engine(sqlalchemy_url, **params)
+
+    def get_reserved_words(self) -> Set[str]:
+        return self.get_dialect().preparer.reserved_words
+
+    def get_quoter(self) -> Callable[[str, Any], str]:
+        return self.get_dialect().identifier_preparer.quote
+
+    def get_df(  # pylint: disable=too-many-locals
+        self,
+        sql: str,
+        schema: Optional[str] = None,
+        mutator: Optional[Callable[[pd.DataFrame], None]] = None,
+    ) -> pd.DataFrame:
+        sqls = [str(s).strip(" ;") for s in sqlparse.parse(sql)]
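+        # `sql` may contain several ;-separated statements: every statement
+        # except the last is executed and its cursor drained, and only the
+        # final statement's result set is returned as a DataFrame.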
+
+        engine = self.get_sqla_engine(schema=schema)
+        username = utils.get_username()
+
+        def needs_conversion(df_series: pd.Series) -> bool:
+            return not df_series.empty and isinstance(df_series[0], (list, dict))
+
+        def _log_query(sql: str) -> None:
+            if log_query:
+                log_query(engine.url, sql, schema, username, __name__, security_manager)
+
+        with closing(engine.raw_connection()) as conn:
+            with closing(conn.cursor()) as cursor:
+                for sql_ in sqls[:-1]:
+                    _log_query(sql_)
+                    self.db_engine_spec.execute(cursor, sql_)
+                    cursor.fetchall()
+
+                _log_query(sqls[-1])
+                self.db_engine_spec.execute(cursor, sqls[-1])
+
+                if cursor.description is not None:
+                    columns = [col_desc[0] for col_desc in cursor.description]
+                else:
+                    columns = []
+
+                df = pd.DataFrame.from_records(
+                    data=list(cursor.fetchall()), columns=columns, coerce_float=True
+                )
+
+                if mutator:
+                    mutator(df)
+
+                for k, v in df.dtypes.items():
+                    if v.type == numpy.object_ and needs_conversion(df[k]):
+                        df[k] = df[k].apply(utils.json_dumps_w_dates)
+                return df
+
+    def compile_sqla_query(self, qry: Select, schema: Optional[str] = None) -> str:
+        engine = self.get_sqla_engine(schema=schema)
+
+        sql = str(qry.compile(engine, compile_kwargs={"literal_binds": True}))
+
+        if (
+            engine.dialect.identifier_preparer._double_percents  # pylint: disable=protected-access
+        ):
+            sql = sql.replace("%%", "%")
+
+        return sql
+
+    def select_star(  # pylint: disable=too-many-arguments
+        self,
+        table_name: str,
+        schema: Optional[str] = None,
+        limit: int = 100,
+        show_cols: bool = False,
+        indent: bool = True,
+        latest_partition: bool = False,
+        cols: Optional[List[Dict[str, Any]]] = None,
+    ) -> str:
+        """Generates a ``select *`` statement in the proper dialect"""
+        eng = self.get_sqla_engine(schema=schema, source=utils.QuerySource.SQL_LAB)
+        return self.db_engine_spec.select_star(
+            self,
+            table_name,
+            schema=schema,
+            engine=eng,
+            limit=limit,
+            show_cols=show_cols,
+            indent=indent,
+            latest_partition=latest_partition,
+            cols=cols,
+        )
+
+    def apply_limit_to_sql(self, sql: str, limit: int = 1000) -> str:
+        return self.db_engine_spec.apply_limit_to_sql(sql, limit, self)
+
+    def safe_sqlalchemy_uri(self) -> str:
+        return self.sqlalchemy_uri
+
+    @property
+    def inspector(self) -> Inspector:
+        engine = self.get_sqla_engine()
+        return sqla.inspect(engine)
+
+    @cache_util.memoized_func(
+        key= "db:{}:schema:None:table_list",
+        attribute_in_key="id",
+    )
+    def get_all_table_names_in_database(
+        self,
+        cache: bool = False,
+        cache_timeout: Optional[bool] = None,
+        force: bool = False,
+    ) -> List[utils.DatasourceName]:
+        """Parameters need to be passed as keyword arguments."""
+        if not self.allow_multi_schema_metadata_fetch:
+            return []
+        return self.db_engine_spec.get_all_datasource_names(self, "table")
+
+    @cache_util.memoized_func(
+        key= "db:{}:schema:None:view_list", attribute_in_key="id"
+    )
+    def get_all_view_names_in_database(
+        self,
+        cache: bool = False,
+        cache_timeout: Optional[bool] = None,
+        force: bool = False,
+    ) -> List[utils.DatasourceName]:
+        """Parameters need to be passed as keyword arguments."""
+        if not self.allow_multi_schema_metadata_fetch:
+            return []
+        return self.db_engine_spec.get_all_datasource_names(self, "view")
+
+    @cache_util.memoized_func(
+        key= f"db:{{}}:schema:{kwargs.get('schema')}:table_list",  # type: ignore
+        attribute_in_key="id",
+    )
+    def get_all_table_names_in_schema(
+        self,
+        schema: str,
+        cache: bool = False,
+        cache_timeout: Optional[int] = None,
+        force: bool = False,
+    ) -> List[utils.DatasourceName]:
+        """Parameters need to be passed as keyword arguments.
+
+        For unused parameters, they are referenced in
+        cache_util.memoized_func decorator.
+
+        :param schema: schema name
+        :param cache: whether cache is enabled for the function
+        :param cache_timeout: timeout in seconds for the cache
+        :param force: whether to force refresh the cache
+        :return: list of tables
+        """
+        try:
+            tables = self.db_engine_spec.get_table_names(
+                database=self, inspector=self.inspector, schema=schema
+            )
+            return [
+                utils.DatasourceName(table=table, schema=schema) for table in tables
+            ]
+        except Exception as ex:  # pylint: disable=broad-except
+            logger.exception(ex)
+
+    @cache_util.memoized_func(
+        key= f"db:{{}}:schema:{kwargs.get('schema')}:view_list",  # type: ignore
+        attribute_in_key="id",
+    )
+    def get_all_view_names_in_schema(
+        self,
+        schema: str,
+        cache: bool = False,
+        cache_timeout: Optional[int] = None,
+        force: bool = False,
+    ) -> List[utils.DatasourceName]:
+        """Parameters need to be passed as keyword arguments.
+
+        For unused parameters, they are referenced in
+        cache_util.memoized_func decorator.
+
+        :param schema: schema name
+        :param cache: whether cache is enabled for the function
+        :param cache_timeout: timeout in seconds for the cache
+        :param force: whether to force refresh the cache
+        :return: list of views
+        """
+        try:
+            views = self.db_engine_spec.get_view_names(
+                database=self, inspector=self.inspector, schema=schema
+            )
+            return [utils.DatasourceName(table=view, schema=schema) for view in views]
+        except Exception as ex:  # pylint: disable=broad-except
+            logger.exception(ex)
+
+    @cache_util.memoized_func(
+        key= "db:{}:schema_list", attribute_in_key="id"
+    )
+    def get_all_schema_names(
+        self,
+        cache: bool = False,
+        cache_timeout: Optional[int] = None,
+        force: bool = False,
+    ) -> List[str]:
+        """Parameters need to be passed as keyword arguments.
+
+        For unused parameters, they are referenced in
+        cache_util.memoized_func decorator.
+
+        :param cache: whether cache is enabled for the function
+        :param cache_timeout: timeout in seconds for the cache
+        :param force: whether to force refresh the cache
+        :return: schema list
+        """
+        return self.db_engine_spec.get_schema_names(self.inspector)
+
+    @property
+    def db_engine_spec(self) -> Type[db_engine_specs.BaseEngineSpec]:
+        return db_engine_specs.engines.get(self.backend, db_engine_specs.BaseEngineSpec)
+
+    @classmethod
+    def get_db_engine_spec_for_backend(
+        cls, backend: str
+    ) -> Type[db_engine_specs.BaseEngineSpec]:
+        return db_engine_specs.engines.get(backend, db_engine_specs.BaseEngineSpec)
+
+    def grains(self) -> Tuple[TimeGrain, ...]:
+        """Defines time granularity database-specific expressions.
+
+        The idea here is to make it easy for users to change the time grain
+        from a datetime (maybe the source grain is arbitrary timestamps, daily
+        or 5 minutes increments) to another, "truncated" datetime. Since
+        each database has slightly different but similar datetime functions,
+        this allows a mapping between database engines and actual functions.
+        """
+        return self.db_engine_spec.get_time_grains()
+
+    def get_extra(self) -> Dict[str, Any]:
+        return self.db_engine_spec.get_extra_params(self)
+
+    def get_encrypted_extra(self) -> Dict[str, Any]:
+        encrypted_extra = {}
+        if self.encrypted_extra:
+            try:
+                encrypted_extra = json.loads(self.encrypted_extra)
+            except json.JSONDecodeError as ex:
+                logger.error(ex)
+                raise ex
+        return encrypted_extra
+
+    def get_table(self, table_name: str, schema: Optional[str] = None) -> Table:
+        extra = self.get_extra()
+        meta = MetaData(**extra.get("metadata_params", {}))
+        return Table(
+            table_name,
+            meta,
+            schema=schema or None,
+            autoload=True,
+            autoload_with=self.get_sqla_engine(),
+        )
+
+    def get_columns(
+        self, table_name: str, schema: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        return self.db_engine_spec.get_columns(self.inspector, table_name, schema)
+
+    def get_indexes(
+        self, table_name: str, schema: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        return self.inspector.get_indexes(table_name, schema)
+
+    def get_pk_constraint(
+        self, table_name: str, schema: Optional[str] = None
+    ) -> Dict[str, Any]:
+        return self.inspector.get_pk_constraint(table_name, schema)
+
+    def get_foreign_keys(
+        self, table_name: str, schema: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        return self.inspector.get_foreign_keys(table_name, schema)
+
+    def get_schema_access_for_csv_upload(  # pylint: disable=invalid-name
+        self,
+    ) -> List[str]:
+        allowed_databases = self.get_extra().get("schemas_allowed_for_csv_upload", [])
+        if hasattr(g, "user"):
+            extra_allowed_databases = config["ALLOWED_USER_CSV_SCHEMA_FUNC"](
+                self, g.user
+            )
+            allowed_databases += extra_allowed_databases
+        return sorted(set(allowed_databases))
+
+    @property
+    def sqlalchemy_uri_decrypted(self) -> str:
+        conn = sqla.engine.url.make_url(self.sqlalchemy_uri)
+        if custom_password_store:
+            conn.password = custom_password_store(conn)
+        else:
+            conn.password = self.password
+        return str(conn)
+
+    @property
+    def sql_url(self) -> str:
+        return f"/superset/sql/{self.id}/"
+
+    @hybrid_property
+    def perm(self) -> str:
+        return f"[{self.database_name}].(id:{self.id})"
+
+    @perm.expression  # type: ignore
+    def perm(cls) -> str:  # pylint: disable=no-self-argument
+        return (
+            "[" + cls.database_name + "].(id:" + expression.cast(cls.id, String) + ")"
+        )
+
+    def get_perm(self) -> str:
+        return self.perm  # type: ignore
+
+    def has_table(self, table: Table) -> bool:
+        engine = self.get_sqla_engine()
+        return engine.has_table(table.table_name, table.schema or None)
+
+    def has_table_by_name(self, table_name: str, schema: Optional[str] = None) -> bool:
+        engine = self.get_sqla_engine()
+        return engine.has_table(table_name, schema)
+
+    @utils.memoized
+    def get_dialect(self) -> Dialect:
+        sqla_url = url.make_url(self.sqlalchemy_uri_decrypted)
+        return sqla_url.get_dialect()()  # pylint: disable=no-member
+
+
+sqla.event.listen(Database, "after_insert", security_manager.set_perm)
+sqla.event.listen(Database, "after_update", security_manager.set_perm)
+
+
+class Log(Model):  # pylint: disable=too-few-public-methods
+
+    """ORM object used to log Superset actions to the database"""
+
+    __tablename__ = "logs"
+
+    id = Column(Integer, primary_key=True)
+    action = Column(String(512))
+    user_id = Column(Integer, ForeignKey("ab_user.id"))
+    dashboard_id = Column(Integer)
+    slice_id = Column(Integer)
+    json = Column(Text)
+    user = relationship(
+        security_manager.user_model, backref="logs", foreign_keys=[user_id]
+    )
+    dttm = Column(DateTime, default=datetime.utcnow)
+    duration_ms = Column(Integer)
+    referrer = Column(String(1024))
+
+
+class FavStar(Model):  # pylint: disable=too-few-public-methods
+    __tablename__ = "favstar"
+
+    id = Column(Integer, primary_key=True)
+    user_id = Column(Integer, ForeignKey("ab_user.id"))
+    class_name = Column(String(50))
+    obj_id = Column(Integer)
+    dttm = Column(DateTime, default=datetime.utcnow)
+
+
+# events for updating tags
+if is_feature_enabled("TAGGING_SYSTEM"):
+    sqla.event.listen(Dashboard, "after_insert", DashboardUpdater.after_insert)
+    sqla.event.listen(Dashboard, "after_update", DashboardUpdater.after_update)
+    sqla.event.listen(Dashboard, "after_delete", DashboardUpdater.after_delete)
+    sqla.event.listen(FavStar, "after_insert", FavStarUpdater.after_insert)
+    sqla.event.listen(FavStar, "after_delete", FavStarUpdater.after_delete)

+ 727 - 0
data/purposeCombined/BI/cube.py

@@ -0,0 +1,727 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=line-too-long,unused-argument,ungrouped-imports
+"""A collection of ORM sqlalchemy models for Superset"""
+import json
+import logging
+import textwrap
+from contextlib import closing
+from copy import deepcopy
+from datetime import datetime
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
+
+import numpy
+import pandas as pd
+import sqlalchemy as sqla
+import sqlparse
+from flask import g, request
+from flask_appbuilder import Model
+from sqlalchemy import (
+    Boolean,
+    Column,
+    create_engine,
+    DateTime,
+    ForeignKey,
+    Integer,
+    MetaData,
+    String,
+    Table,
+    Text,
+)
+from sqlalchemy.engine import Dialect, Engine, url
+from sqlalchemy.engine.reflection import Inspector
+from sqlalchemy.engine.url import make_url, URL
+from sqlalchemy.ext.hybrid import hybrid_property
+from sqlalchemy.orm import relationship
+from sqlalchemy.pool import NullPool
+from sqlalchemy.schema import UniqueConstraint
+from sqlalchemy.sql import expression, Select
+from sqlalchemy_utils import EncryptedType
+
+from superset import app, db_engine_specs, is_feature_enabled, security_manager
+from superset.db_engine_specs.base import TimeGrain
+from superset.models.dashboard import Dashboard
+from superset.models.helpers import AuditMixinNullable, ImportMixin
+from superset.models.tags import DashboardUpdater, FavStarUpdater
+from superset.utils import cache as cache_util, core as utils
+
+config = app.config
+custom_password_store = config["SQLALCHEMY_CUSTOM_PASSWORD_STORE"]
+stats_logger = config["STATS_LOGGER"]
+log_query = config["QUERY_LOGGER"]
+metadata = Model.metadata  # pylint: disable=no-member
+logger = logging.getLogger(__name__)
+
+PASSWORD_MASK = "X" * 10
+DB_CONNECTION_MUTATOR = config["DB_CONNECTION_MUTATOR"]
+
+
+class Url(Model, AuditMixinNullable):
+    """Used for the short url feature"""
+
+    __tablename__ = "url"
+    id = Column(Integer, primary_key=True)
+    url = Column(Text)
+
+
+class KeyValue(Model):  # pylint: disable=too-few-public-methods
+
+    """Used for any type of key-value store"""
+
+    __tablename__ = "keyvalue"
+    id = Column(Integer, primary_key=True)
+    value = Column(Text, nullable=False)
+
+
+class CssTemplate(Model, AuditMixinNullable):
+
+    """CSS templates for dashboards"""
+
+    __tablename__ = "css_templates"
+    id = Column(Integer, primary_key=True)
+    template_name = Column(String(250))
+    css = Column(Text, default="")
+
+
+class Database(
+    Model, AuditMixinNullable, ImportMixin
+):  # pylint: disable=too-many-public-methods
+
+    """An ORM object that stores Database related information"""
+
+    __tablename__ = "dbs"
+    type = "table"
+    __table_args__ = (UniqueConstraint("database_name"),)
+
+    id = Column(Integer, primary_key=True)
+    verbose_name = Column(String(250), unique=True)
+    # short unique name, used in permissions
+    database_name = Column(String(250), unique=True, nullable=False)
+    sqlalchemy_uri = Column(String(1024), nullable=False)
+    password = Column(EncryptedType(String(1024), config["SECRET_KEY"]))
+    cache_timeout = Column(Integer)
+    select_as_create_table_as = Column(Boolean, default=False)
+    expose_in_sqllab = Column(Boolean, default=True)
+    allow_run_async = Column(Boolean, default=False)
+    allow_csv_upload = Column(Boolean, default=False)
+    allow_ctas = Column(Boolean, default=False)
+    allow_cvas = Column(Boolean, default=False)
+    allow_dml = Column(Boolean, default=False)
+    force_ctas_schema = Column(String(250))
+    allow_multi_schema_metadata_fetch = Column(  # pylint: disable=invalid-name
+        Boolean, default=False
+    )
+    extra = Column(
+        Text,
+        default=textwrap.dedent(
+            """\
+    {
+        "metadata_params": {},
+        "engine_params": {},
+        "metadata_cache_timeout": {},
+        "schemas_allowed_for_csv_upload": []
+    }
+    """
+        ),
+    )
+    encrypted_extra = Column(EncryptedType(Text, config["SECRET_KEY"]), nullable=True)
+    impersonate_user = Column(Boolean, default=False)
+    server_cert = Column(EncryptedType(Text, config["SECRET_KEY"]), nullable=True)
+    export_fields = [
+        "database_name",
+        "sqlalchemy_uri",
+        "cache_timeout",
+        "expose_in_sqllab",
+        "allow_run_async",
+        "allow_ctas",
+        "allow_cvas",
+        "allow_csv_upload",
+        "extra",
+    ]
+    export_children = ["tables"]
+
+    def __repr__(self) -> str:
+        return self.name
+
+    @property
+    def name(self) -> str:
+        return self.verbose_name if self.verbose_name else self.database_name
+
+    @property
+    def allows_subquery(self) -> bool:
+        return self.db_engine_spec.allows_subqueries
+
+    @property
+    def function_names(self) -> List[str]:
+        try:
+            return self.db_engine_spec.get_function_names(self)
+        except Exception as ex:  # pylint: disable=broad-except
+            # function_names property is used in bulk APIs and should not hard crash
+            # more info in: https://github.com/apache/incubator-superset/issues/9678
+            logger.error(
+                "Failed to fetch database function names with error: %s", str(ex)
+            )
+        return []
+
+    @property
+    def allows_cost_estimate(self) -> bool:
+        extra = self.get_extra()
+
+        database_version = extra.get("version")
+        cost_estimate_enabled: bool = extra.get("cost_estimate_enabled")  # type: ignore
+
+        return (
+            self.db_engine_spec.get_allow_cost_estimate(database_version)
+            and cost_estimate_enabled
+        )
+
+    @property
+    def allows_virtual_table_explore(self) -> bool:
+        extra = self.get_extra()
+
+        return bool(extra.get("allows_virtual_table_explore", True))
+
+    @property
+    def explore_database_id(self) -> int:
+        return self.get_extra().get("explore_database_id", self.id)
+
+    @property
+    def data(self) -> Dict[str, Any]:
+        return {
+            "id": self.id,
+            "name": self.database_name,
+            "backend": self.backend,
+            "allow_multi_schema_metadata_fetch": self.allow_multi_schema_metadata_fetch,
+            "allows_subquery": self.allows_subquery,
+            "allows_cost_estimate": self.allows_cost_estimate,
+            "allows_virtual_table_explore": self.allows_virtual_table_explore,
+            "explore_database_id": self.explore_database_id,
+        }
+
+    @property
+    def unique_name(self) -> str:
+        return self.database_name
+
+    @property
+    def url_object(self) -> URL:
+        return make_url(self.sqlalchemy_uri_decrypted)
+
+    @property
+    def backend(self) -> str:
+        sqlalchemy_url = make_url(self.sqlalchemy_uri_decrypted)
+        return sqlalchemy_url.get_backend_name()  # pylint: disable=no-member
+
+    @property
+    def metadata_cache_timeout(self) -> Dict[str, Any]:
+        return self.get_extra().get("metadata_cache_timeout", {})
+
+    @property
+    def schema_cache_enabled(self) -> bool:
+        return "schema_cache_timeout" in self.metadata_cache_timeout
+
+    @property
+    def schema_cache_timeout(self) -> Optional[int]:
+        return self.metadata_cache_timeout.get("schema_cache_timeout")
+
+    @property
+    def table_cache_enabled(self) -> bool:
+        return "table_cache_timeout" in self.metadata_cache_timeout
+
+    @property
+    def table_cache_timeout(self) -> Optional[int]:
+        return self.metadata_cache_timeout.get("table_cache_timeout")
+
+    @property
+    def default_schemas(self) -> List[str]:
+        return self.get_extra().get("default_schemas", [])
+
+    @property
+    def connect_args(self) -> Dict[str, Any]:
+        return self.get_extra().get("engine_params", {}).get("connect_args", {})
+
+    @classmethod
+    def get_password_masked_url_from_uri(  # pylint: disable=invalid-name
+        cls, uri: str
+    ) -> URL:
+        sqlalchemy_url = make_url(uri)
+        return cls.get_password_masked_url(sqlalchemy_url)
+
+    @classmethod
+    def get_password_masked_url(
+        cls, url: URL  # pylint: disable=redefined-outer-name
+    ) -> URL:
+        url_copy = deepcopy(url)
+        if url_copy.password is not None:
+            url_copy.password = PASSWORD_MASK
+        return url_copy
+
+    def set_sqlalchemy_uri(self, uri: str) -> None:
+        conn = sqla.engine.url.make_url(uri.strip())
+        if conn.password != PASSWORD_MASK and not custom_password_store:
+            # do not over-write the password with the password mask
+            self.password = conn.password
+        conn.password = PASSWORD_MASK if conn.password else None
+        self.sqlalchemy_uri = str(conn)  # hides the password
+
+    def get_effective_user(
+        self,
+        url: URL,  # pylint: disable=redefined-outer-name
+        user_name: Optional[str] = None,
+    ) -> Optional[str]:
+        """
+        Get the effective user, especially during impersonation.
+        :param url: SQL Alchemy URL object
+        :param user_name: Default username
+        :return: The effective username
+        """
+        effective_username = None
+        if self.impersonate_user:
+            effective_username = url.username
+            if user_name:
+                effective_username = user_name
+            elif (
+                hasattr(g, "user")
+                and hasattr(g.user, "username")
+                and g.user.username is not None
+            ):
+                effective_username = g.user.username
+        return effective_username
+
+    @utils.memoized(watch=("impersonate_user", "sqlalchemy_uri_decrypted", "extra"))
+    def get_sqla_engine(
+        self,
+        schema: Optional[str] = None,
+        nullpool: bool = True,
+        user_name: Optional[str] = None,
+        source: Optional[utils.QuerySource] = None,
+    ) -> Engine:
+        extra = self.get_extra()
+        sqlalchemy_url = make_url(self.sqlalchemy_uri_decrypted)
+        self.db_engine_spec.adjust_database_uri(sqlalchemy_url, schema)
+        effective_username = self.get_effective_user(sqlalchemy_url, user_name)
+        # If using MySQL or Presto for example, will set url.username
+        # If using Hive, will not do anything yet since that relies on a
+        # configuration parameter instead.
+        self.db_engine_spec.modify_url_for_impersonation(
+            sqlalchemy_url, self.impersonate_user, effective_username
+        )
+
+        masked_url = self.get_password_masked_url(sqlalchemy_url)
+        logger.debug("Database.get_sqla_engine(). Masked URL: %s", str(masked_url))
+
+        params = extra.get("engine_params", {})
+        if nullpool:
+            params["poolclass"] = NullPool
+
+        connect_args = params.get("connect_args", {})
+        configuration = connect_args.get("configuration", {})
+
+        # If using Hive, this will set hive.server2.proxy.user=$effective_username
+        configuration.update(
+            self.db_engine_spec.get_configuration_for_impersonation(
+                str(sqlalchemy_url), self.impersonate_user, effective_username
+            )
+        )
+        if configuration:
+            connect_args["configuration"] = configuration
+        if connect_args:
+            params["connect_args"] = connect_args
+
+        params.update(self.get_encrypted_extra())
+
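+        # DB_CONNECTION_MUTATOR is an optional hook from the Superset config
+        # that can rewrite the URL/params per request; the referrer check below
+        # only guesses whether the query comes from a dashboard, chart or SQL Lab.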
+        if DB_CONNECTION_MUTATOR:
+            if not source and request and request.referrer:
+                if "/superset/dashboard/" in request.referrer:
+                    source = utils.QuerySource.DASHBOARD
+                elif "/superset/explore/" in request.referrer:
+                    source = utils.QuerySource.CHART
+                elif "/superset/sqllab/" in request.referrer:
+                    source = utils.QuerySource.SQL_LAB
+
+            sqlalchemy_url, params = DB_CONNECTION_MUTATOR(
+                sqlalchemy_url, params, effective_username, security_manager, source
+            )
+
+        return create_engine(sqlalchemy_url, **params)
+
+    def get_reserved_words(self) -> Set[str]:
+        return self.get_dialect().preparer.reserved_words
+
+    def get_quoter(self) -> Callable[[str, Any], str]:
+        return self.get_dialect().identifier_preparer.quote
+
+    def get_df(  # pylint: disable=too-many-locals
+        self,
+        sql: str,
+        schema: Optional[str] = None,
+        mutator: Optional[Callable[[pd.DataFrame], None]] = None,
+    ) -> pd.DataFrame:
+        sqls = [str(s).strip(" ;") for s in sqlparse.parse(sql)]
+
+        engine = self.get_sqla_engine(schema=schema)
+        username = utils.get_username()
+
+        def needs_conversion(df_series: pd.Series) -> bool:
+            return not df_series.empty and isinstance(df_series[0], (list, dict))
+
+        def _log_query(sql: str) -> None:
+            if log_query:
+                log_query(engine.url, sql, schema, username, __name__, security_manager)
+
+        with closing(engine.raw_connection()) as conn:
+            with closing(conn.cursor()) as cursor:
+                for sql_ in sqls[:-1]:
+                    _log_query(sql_)
+                    self.db_engine_spec.execute(cursor, sql_)
+                    cursor.fetchall()
+
+                _log_query(sqls[-1])
+                self.db_engine_spec.execute(cursor, sqls[-1])
+
+                if cursor.description is not None:
+                    columns = [col_desc[0] for col_desc in cursor.description]
+                else:
+                    columns = []
+
+                df = pd.DataFrame.from_records(
+                    data=list(cursor.fetchall()), columns=columns, coerce_float=True
+                )
+
+                if mutator:
+                    mutator(df)
+
+                for k, v in df.dtypes.items():
+                    if v.type == numpy.object_ and needs_conversion(df[k]):
+                        df[k] = df[k].apply(utils.json_dumps_w_dates)
+                return df
+
+    def compile_sqla_query(self, qry: Select, schema: Optional[str] = None) -> str:
+        engine = self.get_sqla_engine(schema=schema)
+
+        sql = str(qry.compile(engine, compile_kwargs={"literal_binds": True}))
+
+        if (
+            engine.dialect.identifier_preparer._double_percents  # pylint: disable=protected-access
+        ):
+            sql = sql.replace("%%", "%")
+
+        return sql
+
+    def select_star(  # pylint: disable=too-many-arguments
+        self,
+        table_name: str,
+        schema: Optional[str] = None,
+        limit: int = 100,
+        show_cols: bool = False,
+        indent: bool = True,
+        latest_partition: bool = False,
+        cols: Optional[List[Dict[str, Any]]] = None,
+    ) -> str:
+        """Generates a ``select *`` statement in the proper dialect"""
+        eng = self.get_sqla_engine(schema=schema, source=utils.QuerySource.SQL_LAB)
+        return self.db_engine_spec.select_star(
+            self,
+            table_name,
+            schema=schema,
+            engine=eng,
+            limit=limit,
+            show_cols=show_cols,
+            indent=indent,
+            latest_partition=latest_partition,
+            cols=cols,
+        )
+
+    def apply_limit_to_sql(self, sql: str, limit: int = 1000) -> str:
+        return self.db_engine_spec.apply_limit_to_sql(sql, limit, self)
+
+    def safe_sqlalchemy_uri(self) -> str:
+        return self.sqlalchemy_uri
+
+    @property
+    def inspector(self) -> Inspector:
+        engine = self.get_sqla_engine()
+        return sqla.inspect(engine)
+
+    @cache_util.memoized_func(
+        key=lambda *args, **kwargs: "db:{}:schema:None:table_list",
+        attribute_in_key="id",
+    )
+    def get_all_table_names_in_database(
+        self,
+        cache: bool = False,
+        cache_timeout: Optional[bool] = None,
+        force: bool = False,
+    ) -> List[utils.DatasourceName]:
+        """Parameters need to be passed as keyword arguments."""
+        if not self.allow_multi_schema_metadata_fetch:
+            return []
+        return self.db_engine_spec.get_all_datasource_names(self, "table")
+
+    @cache_util.memoized_func(
+        key=lambda *args, **kwargs: "db:{}:schema:None:view_list", attribute_in_key="id"
+    )
+    def get_all_view_names_in_database(
+        self,
+        cache: bool = False,
+        cache_timeout: Optional[bool] = None,
+        force: bool = False,
+    ) -> List[utils.DatasourceName]:
+        """Parameters need to be passed as keyword arguments."""
+        if not self.allow_multi_schema_metadata_fetch:
+            return []
+        return self.db_engine_spec.get_all_datasource_names(self, "view")
+
+    @cache_util.memoized_func(
+        key=lambda *args, **kwargs: f"db:{{}}:schema:{kwargs.get('schema')}:table_list",  # type: ignore
+        attribute_in_key="id",
+    )
+    def get_all_table_names_in_schema(
+        self,
+        schema: str,
+        cache: bool = False,
+        cache_timeout: Optional[int] = None,
+        force: bool = False,
+    ) -> List[utils.DatasourceName]:
+        """Parameters need to be passed as keyword arguments.
+
+        For unused parameters, they are referenced in
+        cache_util.memoized_func decorator.
+
+        :param schema: schema name
+        :param cache: whether cache is enabled for the function
+        :param cache_timeout: timeout in seconds for the cache
+        :param force: whether to force refresh the cache
+        :return: list of tables
+        """
+        try:
+            tables = self.db_engine_spec.get_table_names(
+                database=self, inspector=self.inspector, schema=schema
+            )
+            return [
+                utils.DatasourceName(table=table, schema=schema) for table in tables
+            ]
+        except Exception as ex:  # pylint: disable=broad-except
+            logger.exception(ex)
+            return []
+
+    @cache_util.memoized_func(
+        key=lambda *args, **kwargs: f"db:{{}}:schema:{kwargs.get('schema')}:view_list",  # type: ignore
+        attribute_in_key="id",
+    )
+    def get_all_view_names_in_schema(
+        self,
+        schema: str,
+        cache: bool = False,
+        cache_timeout: Optional[int] = None,
+        force: bool = False,
+    ) -> List[utils.DatasourceName]:
+        """Parameters need to be passed as keyword arguments.
+
+        The parameters that appear unused here are referenced by the
+        cache_util.memoized_func decorator.
+
+        :param schema: schema name
+        :param cache: whether cache is enabled for the function
+        :param cache_timeout: timeout in seconds for the cache
+        :param force: whether to force refresh the cache
+        :return: list of views
+        """
+        try:
+            views = self.db_engine_spec.get_view_names(
+                database=self, inspector=self.inspector, schema=schema
+            )
+            return [utils.DatasourceName(table=view, schema=schema) for view in views]
+        except Exception as ex:  # pylint: disable=broad-except
+            logger.exception(ex)
+            return []
+
+    @cache_util.memoized_func(
+        key=lambda *args, **kwargs: "db:{}:schema_list", attribute_in_key="id"
+    )
+    def get_all_schema_names(
+        self,
+        cache: bool = False,
+        cache_timeout: Optional[int] = None,
+        force: bool = False,
+    ) -> List[str]:
+        """Parameters need to be passed as keyword arguments.
+
+        The parameters that appear unused here are referenced by the
+        cache_util.memoized_func decorator.
+
+        :param cache: whether cache is enabled for the function
+        :param cache_timeout: timeout in seconds for the cache
+        :param force: whether to force refresh the cache
+        :return: schema list
+        """
+        return self.db_engine_spec.get_schema_names(self.inspector)
+
+    @property
+    def db_engine_spec(self) -> Type[db_engine_specs.BaseEngineSpec]:
+        return db_engine_specs.engines.get(self.backend, db_engine_specs.BaseEngineSpec)
+
+    @classmethod
+    def get_db_engine_spec_for_backend(
+        cls, backend: str
+    ) -> Type[db_engine_specs.BaseEngineSpec]:
+        return db_engine_specs.engines.get(backend, db_engine_specs.BaseEngineSpec)
+
+    def grains(self) -> Tuple[TimeGrain, ...]:
+        """Defines time granularity database-specific expressions.
+
+        The idea here is to make it easy for users to change the time grain
+        from a datetime (maybe the source grain is arbitrary timestamps, daily
+        or 5 minutes increments) to another, "truncated" datetime. Since
+        each database has slightly different but similar datetime functions,
+        this allows a mapping between database engines and actual functions.
+        """
+        return self.db_engine_spec.get_time_grains()
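+
+    # Illustrative sketch of the mapping described above (hypothetical
+    # expressions, not taken from Superset's actual engine specs): a daily
+    # ("P1D") grain might translate a raw timestamp column as
+    #   PostgreSQL: DATE_TRUNC('day', dttm_col)
+    #   MySQL:      DATE(dttm_col)
+    # so arbitrary timestamps group cleanly by day in either dialect.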
+
+    def get_extra(self) -> Dict[str, Any]:
+        return self.db_engine_spec.get_extra_params(self)
+
+    def get_encrypted_extra(self) -> Dict[str, Any]:
+        encrypted_extra = {}
+        if self.encrypted_extra:
+            try:
+                encrypted_extra = json.loads(self.encrypted_extra)
+            except json.JSONDecodeError as ex:
+                logger.error(ex)
+                raise ex
+        return encrypted_extra
+
+    def get_table(self, table_name: str, schema: Optional[str] = None) -> Table:
+        extra = self.get_extra()
+        meta = MetaData(**extra.get("metadata_params", {}))
+        return Table(
+            table_name,
+            meta,
+            schema=schema or None,
+            autoload=True,
+            autoload_with=self.get_sqla_engine(),
+        )
+
+    def get_columns(
+        self, table_name: str, schema: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        return self.db_engine_spec.get_columns(self.inspector, table_name, schema)
+
+    def get_indexes(
+        self, table_name: str, schema: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        return self.inspector.get_indexes(table_name, schema)
+
+    def get_pk_constraint(
+        self, table_name: str, schema: Optional[str] = None
+    ) -> Dict[str, Any]:
+        return self.inspector.get_pk_constraint(table_name, schema)
+
+    def get_foreign_keys(
+        self, table_name: str, schema: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        return self.inspector.get_foreign_keys(table_name, schema)
+
+    def get_schema_access_for_csv_upload(  # pylint: disable=invalid-name
+        self,
+    ) -> List[str]:
+        allowed_databases = self.get_extra().get("schemas_allowed_for_csv_upload", [])
+        if hasattr(g, "user"):
+            extra_allowed_databases = config["ALLOWED_USER_CSV_SCHEMA_FUNC"](
+                self, g.user
+            )
+            allowed_databases += extra_allowed_databases
+        return sorted(set(allowed_databases))
+
+    @property
+    def sqlalchemy_uri_decrypted(self) -> str:
+        conn = sqla.engine.url.make_url(self.sqlalchemy_uri)
+        if custom_password_store:
+            conn.password = custom_password_store(conn)
+        else:
+            conn.password = self.password
+        return str(conn)
+
+    @property
+    def sql_url(self) -> str:
+        return f"/superset/sql/{self.id}/"
+
+    @hybrid_property
+    def perm(self) -> str:
+        return f"[{self.database_name}].(id:{self.id})"
+
+    @perm.expression  # type: ignore
+    def perm(cls) -> str:  # pylint: disable=no-self-argument
+        return (
+            "[" + cls.database_name + "].(id:" + expression.cast(cls.id, String) + ")"
+        )
+
+    def get_perm(self) -> str:
+        return self.perm  # type: ignore
+
+    def has_table(self, table: Table) -> bool:
+        engine = self.get_sqla_engine()
+        return engine.has_table(table.table_name, table.schema or None)
+
+    def has_table_by_name(self, table_name: str, schema: Optional[str] = None) -> bool:
+        engine = self.get_sqla_engine()
+        return engine.has_table(table_name, schema)
+
+    @utils.memoized
+    def get_dialect(self) -> Dialect:
+        sqla_url = url.make_url(self.sqlalchemy_uri_decrypted)
+        return sqla_url.get_dialect()()  # pylint: disable=no-member
+
+
+sqla.event.listen(Database, "after_insert", security_manager.set_perm)
+sqla.event.listen(Database, "after_update", security_manager.set_perm)
+
+
+class Log(Model):  # pylint: disable=too-few-public-methods
+
+    """ORM object used to log Superset actions to the database"""
+
+    __tablename__ = "logs"
+
+    id = Column(Integer, primary_key=True)
+    action = Column(String(512))
+    user_id = Column(Integer, ForeignKey("ab_user.id"))
+    dashboard_id = Column(Integer)
+    slice_id = Column(Integer)
+    json = Column(Text)
+    user = relationship(
+        security_manager.user_model, backref="logs", foreign_keys=[user_id]
+    )
+    dttm = Column(DateTime, default=datetime.utcnow)
+    duration_ms = Column(Integer)
+    referrer = Column(String(1024))
+
+
+class FavStar(Model):  # pylint: disable=too-few-public-methods
+    __tablename__ = "favstar"
+
+    id = Column(Integer, primary_key=True)
+    user_id = Column(Integer, ForeignKey("ab_user.id"))
+    class_name = Column(String(50))
+    obj_id = Column(Integer)
+    dttm = Column(DateTime, default=datetime.utcnow)
+
+
+# events for updating tags
+if is_feature_enabled("TAGGING_SYSTEM"):
+    sqla.event.listen(Dashboard, "after_insert", DashboardUpdater.after_insert)
+    sqla.event.listen(Dashboard, "after_update", DashboardUpdater.after_update)
+    sqla.event.listen(Dashboard, "after_delete", DashboardUpdater.after_delete)
+    sqla.event.listen(FavStar, "after_insert", FavStarUpdater.after_insert)
+    sqla.event.listen(FavStar, "after_delete", FavStarUpdater.after_delete)

+ 197 - 0
data/purposeCombined/BI/etl_testing.py

@@ -0,0 +1,197 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue May 12 00:00:00 2020
+
+@author: Shaji
+"""
+
+from . import exceptions
+
+from datetime import datetime
+import os
+import pandas as pd
+
+def column_level_check(source_df,target_df,primary_keys):
+    """
+    Usage: [arg1]:[Pandas DataFrame - source], [arg2]:[Pandas DataFrame - target], [arg3]:[Primary keys (separated by comma)]
+    Description: Performs column level testing between two DataFrames by joining using the primary keys.
+    Returns: [Mismatch Count], [Test Log (list)], [Pandas dataframe - mismatch (if any)]
+    """
+    global execution_status
+
+    systime=datetime.now()
+
+    start_time=systime.strftime("%Y-%m-%d %H:%M:%S")
+
+    log_list=[]
+
+    execution_status='RUNNING'
+
+    log_list.append('START TIME: '+start_time)
+
+    key_list=primary_keys.split(',')
+
+    src=source_df
+    tgt=target_df
+
+    log_list.append(str(datetime.now())+': DIFFERENTIATING SOURCE AND TARGET COLUMNS')
+    if execution_status!='FAILED':
+        try:
+            src_k=[]
+            src_columns=[]
+            for i in src.columns:
+                if str.lower(i) in [str.lower(key) for key in key_list]:
+                    src_columns.append(str.lower(i))
+                    src_k.append(str.lower(i))
+                else:
+                    src_columns.append(str(i) + '_src')
+            src.columns = src_columns
+            tgt_k=[]
+            tgt_columns=[]
+            for i in tgt.columns:
+                if str.lower(i) in [str.lower(key) for key in key_list]:
+                    tgt_columns.append(str.lower(i))
+                    tgt_k.append(str.lower(i))
+                else:
+                    tgt_columns.append(str(i) + '_tgt')
+            tgt.columns = tgt_columns
+        except Exception as e:
+            print('Failed while DIFFERENTIATING SOURCE AND TARGET COLUMNS: '+str(e))
+            log_list.append('Failed while DIFFERENTIATING SOURCE AND TARGET COLUMNS: '+str(e))
+            execution_status='FAILED'
+    log_list.append(str(datetime.now())+': CHECKING IF THE GROUP BY MAKES THE RECORD LEVEL SAME AS ACTUAL')
+    if execution_status!='FAILED':
+        try:
+            index_unique_flag=[]
+            if src.groupby(src_k).count().shape[0]==src.shape[0]:
+                index_unique_flag.append(True)
+            else:
+                index_unique_flag.append(False)
+            if tgt.groupby(tgt_k).count().shape[0]==tgt.shape[0]:
+                index_unique_flag.append(True)
+            else:
+                index_unique_flag.append(False)
+        except Exception as e:
+            print('Failed while CHECKING IF THE GROUP BY MAKES THE RECORD LEVEL SAME AS ACTUAL: '+str(e))
+            log_list.append('Failed while CHECKING IF THE GROUP BY MAKES THE RECORD LEVEL SAME AS ACTUAL: '+str(e))
+            execution_status='FAILED'
+    if execution_status!='FAILED':
+        try:
+            if all(index_unique_flag)==True:
+                log_list.append(str(datetime.now())+': JOINING THE TABLES')
+                try:
+                    df=tgt.set_index(tgt_k).join(src.set_index(src_k),how='left')
+                except Exception as e:
+                    print('Failed while JOINING THE TABLES: '+str(e))
+                    log_list.append('Failed while JOINING THE TABLES: '+str(e))
+                    execution_status='FAILED'
+                log_list.append(str(datetime.now())+': FINDING THE TARGET COLUMN AND SOURCE COLUMN TO BE COMPARED')
+                if execution_status!='FAILED':
+                    try:
+                        ma_list=[]
+                        for i in range(len(df.columns)):
+                            if df.columns[i][-3:]=='tgt':
+                                for j in range(len(df.columns)):
+                                    if df.columns[j][-3:]=='src':
+                                        if str.lower(df.columns[i][:-4])==str.lower(df.columns[j][:-4]):
+                                            ma_list.append([j,i])
+                        match_cols=''
+                        for i in range(len(ma_list)):
+                            match_cols+=str(i+1)+': '+df.columns[ma_list[i][1]]+' = '+df.columns[ma_list[i][0]]+' , '
+                        log_list.append('Matching columns '+match_cols)
+                    except Exception as e:
+                        print('Failed while FINDING THE TARGET COLUMN AND SOURCE COLUMN TO BE COMPARED: '+str(e))
+                        log_list.append('Failed while FINDING THE TARGET COLUMN AND SOURCE COLUMN TO BE COMPARED: '+str(e))
+                        execution_status='FAILED'
+                log_list.append(str(datetime.now())+': COMPARISON STARTED')
+                if execution_status!='FAILED':
+                    try:
+                        mis_cols=[]
+                        res=[]
+                        index=[]
+                        for i in range(len(ma_list)):
+                            src_series = df[df.columns[ma_list[i][0]]].apply(lambda x: str(x).strip()).astype(str).fillna(str(0))
+                            tgt_series = df[df.columns[ma_list[i][1]]].apply(lambda x: str(x).strip()).astype(str).fillna(str(0))
+                            matches = src_series == tgt_series
+                            if all(matches):
+                                res.append(True)
+                            else:
+                                res.append(False)
+                                mis_cols.append(df.columns[ma_list[i][0]])
+                                mis_cols.append(df.columns[ma_list[i][1]])
+                                for j in range(len(matches)):
+                                    if not matches.iloc[j]:
+                                        index.append(j)
+                        un_df=df[mis_cols].iloc[list(set(index))]
+                    except Exception as e:
+                        print('Failed while COMPARING: '+str(e))
+                        log_list.append('Failed while COMPARING: '+str(e))
+                        execution_status='FAILED'
+                log_list.append(str(datetime.now())+': TEST RESULT:')
+                if execution_status!='FAILED':
+                    try:
+                        if all(res)==True:
+                            mismatch_count=0
+                            print('COLUMN LEVEL CHECK PASSED')
+                            execution_status='SUCCESS'
+                            log_list.append('COLUMN LEVEL CHECK PASSED')
+                        else:
+                            log_list.append((str(len(set(index)))+' records unmatched'))
+                            log_list.append('Column level check Failed')
+                            mismatch_count=len(set(index))
+                            execution_status='SUCCESS'
+                    except Exception as e:
+                        print('Failed while getting the TEST RESULT: '+str(e))
+                        log_list.append('Failed while getting the TEST RESULT: '+str(e))
+                        execution_status='FAILED'
+            else:
+                log_list.append('The records grouped at the level of key columns are not unique')
+        except Exception as e:
+            log_list.append('Failed while CHECKING IF THE GROUP BY MAKES THE RECORD LEVEL SAME AS ACTUAL: '+str(e))
+            execution_status='FAILED'
+    if execution_status=='FAILED':
+        print('Check Logs for the error message')
+        raise exceptions.ExecutionError
+    return mismatch_count,log_list,un_df
+
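+# A minimal usage sketch for column_level_check (illustrative only: the tiny
+# DataFrames below are hypothetical, and the relative `exceptions` import at
+# the top of this module assumes it is executed as part of its package).
+if __name__ == "__main__":
+    _src = pd.DataFrame({"id": [1, 2, 3], "amount": [10, 20, 30]})
+    _tgt = pd.DataFrame({"id": [1, 2, 3], "amount": [10, 25, 30]})
+    # Joins on "id" and compares every non-key column; one value differs here.
+    _count, _log, _diff = column_level_check(_src, _tgt, "id")
+    print(_count)  # expected: 1 (one record differs on "amount")
+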
+def sort_and_compare(source_df,target_df):
+    """
+    Usage: [arg1]:[Pandas DataFrame - source], [arg2]:[Pandas DataFrame - target]
+    Description: Sort and Compare two datasets.
+    Returns: [Mismatch Count], [Test Log (list)], [Pandas dataframe - mismatch (if any)]
+    """
+    log_list=[]
+    col1=source_df.columns
+    col2=target_df.columns
+    cols=list(set(col1.sort_values()).intersection(set(col2.sort_values())))
+    log_list.append('Common column(s): '+', '.join(cols))
+
+    source_df.sort_values(cols, axis=0, ascending=True, inplace=True)
+    target_df.sort_values(cols, axis=0, ascending=True, inplace=True)
+
+    data1=source_df[cols].reset_index(drop=True)
+    data2=target_df[cols].reset_index(drop=True)
+
+    data1.head()
+    data2.head()
+
+    result=data1==data2
+    bool_list=[]
+    mis_cols=[]
+    mis_index=[]
+    for i in cols:
+        if all(result[i])==True:
+            bool_list.append(True)
+        else:
+            bool_list.append(False)
+            mis_cols.append(i)
+            for j in range(len(result[i])):
+                if result[i][j]==False:
+                    mis_index.append(j)
+    un_df=pd.concat([data1.iloc[list(set(mis_index))],data2.iloc[list(set(mis_index))]],axis=1)
+
+    mismatch_count=0
+    if all(bool_list)==True:
+        log_list.append('Records are matching')
+    else:
+        mismatch_count=len(set(mis_index))
+        log_list.append(str(mismatch_count)+' records unmatched')
+        log_list.append('Column(s): '+', '.join(mis_cols))
+    return mismatch_count,log_list,un_df[mis_cols]
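+
+# A minimal usage sketch for sort_and_compare (illustrative, hypothetical data):
+# the two frames hold the same records in a different row order, so after the
+# sort on their common columns no cell-level mismatch remains.
+if __name__ == "__main__":
+    _a = pd.DataFrame({"id": [2, 1, 3], "val": ["y", "x", "z"]})
+    _b = pd.DataFrame({"id": [1, 2, 3], "val": ["x", "y", "z"]})
+    _count, _log, _diff = sort_and_compare(_a, _b)
+    print(_count)  # expected: 0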

+ 33 - 0
data/purposeCombined/BI/examples/__init__.py

@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from .bart_lines import load_bart_lines
+from .birth_names import load_birth_names
+from .country_map import load_country_map_data
+from .css_templates import load_css_templates
+from .deck import load_deck_dash
+from .energy import load_energy
+from .flights import load_flights
+from .long_lat import load_long_lat_data
+from .misc_dashboard import load_misc_dashboard
+from .multi_line import load_multi_line
+from .multiformat_time_series import load_multiformat_time_series
+from .paris import load_paris_iris_geojson
+from .random_time_series import load_random_time_series_data
+from .sf_population_polygons import load_sf_population_polygons
+from .tabbed_dashboard import load_tabbed_dashboard
+from .unicode_test_data import load_unicode_test_data
+from .world_bank import load_world_bank_health_n_pop

+ 63 - 0
data/purposeCombined/BI/examples/bart_lines.py

@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import json
+
+import pandas as pd
+import polyline
+from sqlalchemy import String, Text
+
+from superset import db
+from superset.utils.core import get_example_database
+
+from .helpers import get_example_data, TBL
+
+
+def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None:
+    tbl_name = "bart_lines"
+    database = get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        content = get_example_data("bart-lines.json.gz")
+        df = pd.read_json(content, encoding="latin-1")
+        df["path_json"] = df.path.map(json.dumps)
+        df["polyline"] = df.path.map(polyline.encode)
+        del df["path"]
+
+        df.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={
+                "color": String(255),
+                "name": String(255),
+                "polyline": Text,
+                "path_json": Text,
+            },
+            index=False,
+        )
+
+    print("Creating table {} reference".format(tbl_name))
+    tbl = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not tbl:
+        tbl = TBL(table_name=tbl_name)
+    tbl.description = "BART lines"
+    tbl.database = database
+    db.session.merge(tbl)
+    db.session.commit()
+    tbl.fetch_metadata()

+ 763 - 0
data/purposeCombined/BI/examples/birth_names.py

@@ -0,0 +1,763 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import json
+import textwrap
+from typing import Dict, Union
+
+import pandas as pd
+from sqlalchemy import DateTime, String
+from sqlalchemy.sql import column
+
+from superset import db, security_manager
+from superset.connectors.sqla.models import SqlMetric, TableColumn
+from superset.models.core import Database
+from superset.models.dashboard import Dashboard
+from superset.models.slice import Slice
+from superset.utils.core import get_example_database
+
+from .helpers import (
+    config,
+    get_example_data,
+    get_slice_json,
+    merge_slice,
+    misc_dash_slices,
+    TBL,
+    update_slice_ids,
+)
+
+
+def gen_filter(
+    subject: str, comparator: str, operator: str = "=="
+) -> Dict[str, Union[bool, str]]:
+    return {
+        "clause": "WHERE",
+        "comparator": comparator,
+        "expressionType": "SIMPLE",
+        "operator": operator,
+        "subject": subject,
+    }
+
+
+def load_data(tbl_name: str, database: Database) -> None:
+    pdf = pd.read_json(get_example_data("birth_names.json.gz"))
+    pdf.ds = pd.to_datetime(pdf.ds, unit="ms")
+    pdf.to_sql(
+        tbl_name,
+        database.get_sqla_engine(),
+        if_exists="replace",
+        chunksize=500,
+        dtype={
+            "ds": DateTime,
+            "gender": String(16),
+            "state": String(10),
+            "name": String(255),
+        },
+        index=False,
+    )
+    print("Done loading table!")
+    print("-" * 80)
+
+
+def load_birth_names(only_metadata: bool = False, force: bool = False) -> None:
+    """Loading birth name dataset from a zip file in the repo"""
+    # pylint: disable=too-many-locals
+    tbl_name = "birth_names"
+    database = get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        load_data(tbl_name, database)
+
+    obj = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not obj:
+        print(f"Creating table [{tbl_name}] reference")
+        obj = TBL(table_name=tbl_name)
+        db.session.add(obj)
+    obj.main_dttm_col = "ds"
+    obj.database = database
+    obj.filter_select_enabled = True
+
+    if not any(col.column_name == "num_california" for col in obj.columns):
+        col_state = str(column("state").compile(db.engine))
+        col_num = str(column("num").compile(db.engine))
+        obj.columns.append(
+            TableColumn(
+                column_name="num_california",
+                expression=f"CASE WHEN {col_state} = 'CA' THEN {col_num} ELSE 0 END",
+            )
+        )
+
+    if not any(col.metric_name == "sum__num" for col in obj.metrics):
+        col = str(column("num").compile(db.engine))
+        obj.metrics.append(SqlMetric(metric_name="sum__num", expression=f"SUM({col})"))
+
+    db.session.commit()
+    obj.fetch_metadata()
+    tbl = obj
+
+    metrics = [
+        {
+            "expressionType": "SIMPLE",
+            "column": {"column_name": "num", "type": "BIGINT"},
+            "aggregate": "SUM",
+            "label": "Births",
+            "optionName": "metric_11",
+        }
+    ]
+    metric = "sum__num"
+
+    defaults = {
+        "compare_lag": "10",
+        "compare_suffix": "o10Y",
+        "limit": "25",
+        "granularity_sqla": "ds",
+        "groupby": [],
+        "row_limit": config["ROW_LIMIT"],
+        "since": "100 years ago",
+        "until": "now",
+        "viz_type": "table",
+        "markup_type": "markdown",
+    }
+
+    admin = security_manager.find_user("admin")
+
+    print("Creating some slices")
+    slices = [
+        Slice(
+            slice_name="Participants",
+            viz_type="big_number",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="big_number",
+                granularity_sqla="ds",
+                compare_lag="5",
+                compare_suffix="over 5Y",
+                metric=metric,
+            ),
+        ),
+        Slice(
+            slice_name="Genders",
+            viz_type="pie",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults, viz_type="pie", groupby=["gender"], metric=metric
+            ),
+        ),
+        Slice(
+            slice_name="Trends",
+            viz_type="line",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="line",
+                groupby=["name"],
+                granularity_sqla="ds",
+                rich_tooltip=True,
+                show_legend=True,
+                metrics=metrics,
+            ),
+        ),
+        Slice(
+            slice_name="Genders by State",
+            viz_type="dist_bar",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                adhoc_filters=[
+                    {
+                        "clause": "WHERE",
+                        "expressionType": "SIMPLE",
+                        "filterOptionName": "2745eae5",
+                        "comparator": ["other"],
+                        "operator": "NOT IN",
+                        "subject": "state",
+                    }
+                ],
+                viz_type="dist_bar",
+                metrics=[
+                    {
+                        "expressionType": "SIMPLE",
+                        "column": {"column_name": "sum_boys", "type": "BIGINT(20)"},
+                        "aggregate": "SUM",
+                        "label": "Boys",
+                        "optionName": "metric_11",
+                    },
+                    {
+                        "expressionType": "SIMPLE",
+                        "column": {"column_name": "sum_girls", "type": "BIGINT(20)"},
+                        "aggregate": "SUM",
+                        "label": "Girls",
+                        "optionName": "metric_12",
+                    },
+                ],
+                groupby=["state"],
+            ),
+        ),
+        Slice(
+            slice_name="Girls",
+            viz_type="table",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                groupby=["name"],
+                adhoc_filters=[gen_filter("gender", "girl")],
+                row_limit=50,
+                timeseries_limit_metric="sum__num",
+                metrics=metrics,
+            ),
+        ),
+        Slice(
+            slice_name="Girl Name Cloud",
+            viz_type="word_cloud",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="word_cloud",
+                size_from="10",
+                series="name",
+                size_to="70",
+                rotation="square",
+                limit="100",
+                adhoc_filters=[gen_filter("gender", "girl")],
+                metric=metric,
+            ),
+        ),
+        Slice(
+            slice_name="Boys",
+            viz_type="table",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                groupby=["name"],
+                adhoc_filters=[gen_filter("gender", "boy")],
+                row_limit=50,
+                metrics=metrics,
+            ),
+        ),
+        Slice(
+            slice_name="Boy Name Cloud",
+            viz_type="word_cloud",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="word_cloud",
+                size_from="10",
+                series="name",
+                size_to="70",
+                rotation="square",
+                limit="100",
+                adhoc_filters=[gen_filter("gender", "boy")],
+                metric=metric,
+            ),
+        ),
+        Slice(
+            slice_name="Top 10 Girl Name Share",
+            viz_type="area",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                adhoc_filters=[gen_filter("gender", "girl")],
+                comparison_type="values",
+                groupby=["name"],
+                limit=10,
+                stacked_style="expand",
+                time_grain_sqla="P1D",
+                viz_type="area",
+                x_axis_format="smart_date",
+                metrics=metrics,
+            ),
+        ),
+        Slice(
+            slice_name="Top 10 Boy Name Share",
+            viz_type="area",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                adhoc_filters=[gen_filter("gender", "boy")],
+                comparison_type="values",
+                groupby=["name"],
+                limit=10,
+                stacked_style="expand",
+                time_grain_sqla="P1D",
+                viz_type="area",
+                x_axis_format="smart_date",
+                metrics=metrics,
+            ),
+        ),
+    ]
+    misc_slices = [
+        Slice(
+            slice_name="Average and Sum Trends",
+            viz_type="dual_line",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="dual_line",
+                metric={
+                    "expressionType": "SIMPLE",
+                    "column": {"column_name": "num", "type": "BIGINT(20)"},
+                    "aggregate": "AVG",
+                    "label": "AVG(num)",
+                    "optionName": "metric_vgops097wej_g8uff99zhk7",
+                },
+                metric_2="sum__num",
+                granularity_sqla="ds",
+                metrics=metrics,
+            ),
+        ),
+        Slice(
+            slice_name="Num Births Trend",
+            viz_type="line",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(defaults, viz_type="line", metrics=metrics),
+        ),
+        Slice(
+            slice_name="Daily Totals",
+            viz_type="table",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            created_by=admin,
+            params=get_slice_json(
+                defaults,
+                groupby=["ds"],
+                since="40 years ago",
+                until="now",
+                viz_type="table",
+                metrics=metrics,
+            ),
+        ),
+        Slice(
+            slice_name="Number of California Births",
+            viz_type="big_number_total",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                metric={
+                    "expressionType": "SIMPLE",
+                    "column": {
+                        "column_name": "num_california",
+                        "expression": "CASE WHEN state = 'CA' THEN num ELSE 0 END",
+                    },
+                    "aggregate": "SUM",
+                    "label": "SUM(num_california)",
+                },
+                viz_type="big_number_total",
+                granularity_sqla="ds",
+            ),
+        ),
+        Slice(
+            slice_name="Top 10 California Names Timeseries",
+            viz_type="line",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                metrics=[
+                    {
+                        "expressionType": "SIMPLE",
+                        "column": {
+                            "column_name": "num_california",
+                            "expression": "CASE WHEN state = 'CA' THEN num ELSE 0 END",
+                        },
+                        "aggregate": "SUM",
+                        "label": "SUM(num_california)",
+                    }
+                ],
+                viz_type="line",
+                granularity_sqla="ds",
+                groupby=["name"],
+                timeseries_limit_metric={
+                    "expressionType": "SIMPLE",
+                    "column": {
+                        "column_name": "num_california",
+                        "expression": "CASE WHEN state = 'CA' THEN num ELSE 0 END",
+                    },
+                    "aggregate": "SUM",
+                    "label": "SUM(num_california)",
+                },
+                limit="10",
+            ),
+        ),
+        Slice(
+            slice_name="Names Sorted by Num in California",
+            viz_type="table",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                metrics=metrics,
+                groupby=["name"],
+                row_limit=50,
+                timeseries_limit_metric={
+                    "expressionType": "SIMPLE",
+                    "column": {
+                        "column_name": "num_california",
+                        "expression": "CASE WHEN state = 'CA' THEN num ELSE 0 END",
+                    },
+                    "aggregate": "SUM",
+                    "label": "SUM(num_california)",
+                },
+            ),
+        ),
+        Slice(
+            slice_name="Number of Girls",
+            viz_type="big_number_total",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                metric=metric,
+                viz_type="big_number_total",
+                granularity_sqla="ds",
+                adhoc_filters=[gen_filter("gender", "girl")],
+                subheader="total female participants",
+            ),
+        ),
+        Slice(
+            slice_name="Pivot Table",
+            viz_type="pivot_table",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="pivot_table",
+                groupby=["name"],
+                columns=["state"],
+                metrics=metrics,
+            ),
+        ),
+    ]
+    for slc in slices:
+        merge_slice(slc)
+
+    for slc in misc_slices:
+        merge_slice(slc)
+        misc_dash_slices.add(slc.slice_name)
+
+    print("Creating a dashboard")
+    dash = db.session.query(Dashboard).filter_by(slug="births").first()
+
+    if not dash:
+        dash = Dashboard()
+        db.session.add(dash)
+    dash.published = True
+    dash.json_metadata = textwrap.dedent(
+        """\
+    {
+        "label_colors": {
+            "Girls": "#FF69B4",
+            "Boys": "#ADD8E6",
+            "girl": "#FF69B4",
+            "boy": "#ADD8E6"
+        }
+    }"""
+    )
+    js = textwrap.dedent(
+        # pylint: disable=line-too-long
+        """\
+        {
+          "CHART-6GdlekVise": {
+            "children": [],
+            "id": "CHART-6GdlekVise",
+            "meta": {
+              "chartId": 5547,
+              "height": 50,
+              "sliceName": "Top 10 Girl Name Share",
+              "width": 5
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-eh0w37bWbR"
+            ],
+            "type": "CHART"
+          },
+          "CHART-6n9jxb30JG": {
+            "children": [],
+            "id": "CHART-6n9jxb30JG",
+            "meta": {
+              "chartId": 5540,
+              "height": 36,
+              "sliceName": "Genders by State",
+              "width": 5
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW--EyBZQlDi"
+            ],
+            "type": "CHART"
+          },
+          "CHART-Jj9qh1ol-N": {
+            "children": [],
+            "id": "CHART-Jj9qh1ol-N",
+            "meta": {
+              "chartId": 5545,
+              "height": 50,
+              "sliceName": "Boy Name Cloud",
+              "width": 4
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-kzWtcvo8R1"
+            ],
+            "type": "CHART"
+          },
+          "CHART-ODvantb_bF": {
+            "children": [],
+            "id": "CHART-ODvantb_bF",
+            "meta": {
+              "chartId": 5548,
+              "height": 50,
+              "sliceName": "Top 10 Boy Name Share",
+              "width": 5
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-kzWtcvo8R1"
+            ],
+            "type": "CHART"
+          },
+          "CHART-PAXUUqwmX9": {
+            "children": [],
+            "id": "CHART-PAXUUqwmX9",
+            "meta": {
+              "chartId": 5538,
+              "height": 34,
+              "sliceName": "Genders",
+              "width": 3
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-2n0XgiHDgs"
+            ],
+            "type": "CHART"
+          },
+          "CHART-_T6n_K9iQN": {
+            "children": [],
+            "id": "CHART-_T6n_K9iQN",
+            "meta": {
+              "chartId": 5539,
+              "height": 36,
+              "sliceName": "Trends",
+              "width": 7
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW--EyBZQlDi"
+            ],
+            "type": "CHART"
+          },
+          "CHART-eNY0tcE_ic": {
+            "children": [],
+            "id": "CHART-eNY0tcE_ic",
+            "meta": {
+              "chartId": 5537,
+              "height": 34,
+              "sliceName": "Participants",
+              "width": 3
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-2n0XgiHDgs"
+            ],
+            "type": "CHART"
+          },
+          "CHART-g075mMgyYb": {
+            "children": [],
+            "id": "CHART-g075mMgyYb",
+            "meta": {
+              "chartId": 5541,
+              "height": 50,
+              "sliceName": "Girls",
+              "width": 3
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-eh0w37bWbR"
+            ],
+            "type": "CHART"
+          },
+          "CHART-n-zGGE6S1y": {
+            "children": [],
+            "id": "CHART-n-zGGE6S1y",
+            "meta": {
+              "chartId": 5542,
+              "height": 50,
+              "sliceName": "Girl Name Cloud",
+              "width": 4
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-eh0w37bWbR"
+            ],
+            "type": "CHART"
+          },
+          "CHART-vJIPjmcbD3": {
+            "children": [],
+            "id": "CHART-vJIPjmcbD3",
+            "meta": {
+              "chartId": 5543,
+              "height": 50,
+              "sliceName": "Boys",
+              "width": 3
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-kzWtcvo8R1"
+            ],
+            "type": "CHART"
+          },
+          "DASHBOARD_VERSION_KEY": "v2",
+          "GRID_ID": {
+            "children": [
+              "ROW-2n0XgiHDgs",
+              "ROW--EyBZQlDi",
+              "ROW-eh0w37bWbR",
+              "ROW-kzWtcvo8R1"
+            ],
+            "id": "GRID_ID",
+            "parents": [
+              "ROOT_ID"
+            ],
+            "type": "GRID"
+          },
+          "HEADER_ID": {
+            "id": "HEADER_ID",
+            "meta": {
+              "text": "Births"
+            },
+            "type": "HEADER"
+          },
+          "MARKDOWN-zaflB60tbC": {
+            "children": [],
+            "id": "MARKDOWN-zaflB60tbC",
+            "meta": {
+              "code": "<div style=\\"text-align:center\\">  <h1>Birth Names Dashboard</h1>  <img src=\\"/static/assets/images/babies.png\\" style=\\"width:50%;\\"></div>",
+              "height": 34,
+              "width": 6
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID",
+              "ROW-2n0XgiHDgs"
+            ],
+            "type": "MARKDOWN"
+          },
+          "ROOT_ID": {
+            "children": [
+              "GRID_ID"
+            ],
+            "id": "ROOT_ID",
+            "type": "ROOT"
+          },
+          "ROW--EyBZQlDi": {
+            "children": [
+              "CHART-_T6n_K9iQN",
+              "CHART-6n9jxb30JG"
+            ],
+            "id": "ROW--EyBZQlDi",
+            "meta": {
+              "background": "BACKGROUND_TRANSPARENT"
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID"
+            ],
+            "type": "ROW"
+          },
+          "ROW-2n0XgiHDgs": {
+            "children": [
+              "CHART-eNY0tcE_ic",
+              "MARKDOWN-zaflB60tbC",
+              "CHART-PAXUUqwmX9"
+            ],
+            "id": "ROW-2n0XgiHDgs",
+            "meta": {
+              "background": "BACKGROUND_TRANSPARENT"
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID"
+            ],
+            "type": "ROW"
+          },
+          "ROW-eh0w37bWbR": {
+            "children": [
+              "CHART-g075mMgyYb",
+              "CHART-n-zGGE6S1y",
+              "CHART-6GdlekVise"
+            ],
+            "id": "ROW-eh0w37bWbR",
+            "meta": {
+              "background": "BACKGROUND_TRANSPARENT"
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID"
+            ],
+            "type": "ROW"
+          },
+          "ROW-kzWtcvo8R1": {
+            "children": [
+              "CHART-vJIPjmcbD3",
+              "CHART-Jj9qh1ol-N",
+              "CHART-ODvantb_bF"
+            ],
+            "id": "ROW-kzWtcvo8R1",
+            "meta": {
+              "background": "BACKGROUND_TRANSPARENT"
+            },
+            "parents": [
+              "ROOT_ID",
+              "GRID_ID"
+            ],
+            "type": "ROW"
+          }
+        }
+        """  # pylint: enable=line-too-long
+    )
+    pos = json.loads(js)
+    # dashboard v2 doesn't allow adding markup slices
+    dash.slices = [slc for slc in slices if slc.viz_type != "markup"]
+    update_slice_ids(pos, dash.slices)
+    dash.dashboard_title = "USA Births Names"
+    dash.position_json = json.dumps(pos, indent=4)
+    dash.slug = "births"
+    db.session.commit()

+ 373 - 0
data/purposeCombined/BI/examples/countries.md

@@ -0,0 +1,373 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+This data was downloaded from the World Bank's
+[Health Nutrition and Population Statistics data catalog](https://datacatalog.worldbank.org/dataset/health-nutrition-and-population-statistics).
+
+Here's the script that was used to massage the data:
+
+    DIR = ""
+    df_country = pd.read_csv(DIR + '/HNP_Country.csv')
+    df_country.columns = ['country_code'] + list(df_country.columns[1:])
+    df_country = df_country[['country_code', 'Region']]
+    df_country.columns = ['country_code', 'region']
+
+    df = pd.read_csv(DIR + '/HNP_Data.csv')
+    del df['Unnamed: 60']
+    df.columns = ['country_name', 'country_code'] + list(df.columns[2:])
+    ndf = df.merge(df_country, how='inner')
+
+    dims = ('country_name', 'country_code', 'region')
+    vv = [str(i) for i in range(1960, 2015)]
+    mdf = pd.melt(ndf, id_vars=dims + ('Indicator Code',), value_vars=vv)
+    mdf['year'] = mdf.variable + '-01-01'
+    dims = dims + ('year',)
+
+    pdf = mdf.pivot_table(values='value', columns='Indicator Code', index=dims)
+    pdf = pdf.reset_index()
+    pdf.to_csv(DIR + '/countries.csv')
+    pdf.to_json(DIR + '/countries.json', orient='records')
+
+Here's the description of the metrics available:
+
+Series Code | Indicator Name
+--- | ---
+NY.GNP.PCAP.CD | GNI per capita, Atlas method (current US$)
+SE.ADT.1524.LT.FM.ZS | Literacy rate, youth (ages 15-24), gender parity index (GPI)
+SE.ADT.1524.LT.MA.ZS | Literacy rate, youth male (% of males ages 15-24)
+SE.ADT.1524.LT.ZS | Literacy rate, youth total (% of people ages 15-24)
+SE.ADT.LITR.FE.ZS | Literacy rate, adult female (% of females ages 15 and above)
+SE.ADT.LITR.MA.ZS | Literacy rate, adult male (% of males ages 15 and above)
+SE.ADT.LITR.ZS | Literacy rate, adult total (% of people ages 15 and above)
+SE.ENR.ORPH | Ratio of school attendance of orphans to school attendance of non-orphans ages 10-14
+SE.PRM.CMPT.FE.ZS | Primary completion rate, female (% of relevant age group)
+SE.PRM.CMPT.MA.ZS | Primary completion rate, male (% of relevant age group)
+SE.PRM.CMPT.ZS | Primary completion rate, total (% of relevant age group)
+SE.PRM.ENRR | School enrollment, primary (% gross)
+SE.PRM.ENRR.FE | School enrollment, primary, female (% gross)
+SE.PRM.ENRR.MA | School enrollment, primary, male (% gross)
+SE.PRM.NENR | School enrollment, primary (% net)
+SE.PRM.NENR.FE | School enrollment, primary, female (% net)
+SE.PRM.NENR.MA | School enrollment, primary, male (% net)
+SE.SEC.ENRR | School enrollment, secondary (% gross)
+SE.SEC.ENRR.FE | School enrollment, secondary, female (% gross)
+SE.SEC.ENRR.MA | School enrollment, secondary, male (% gross)
+SE.SEC.NENR | School enrollment, secondary (% net)
+SE.SEC.NENR.FE | School enrollment, secondary, female (% net)
+SE.SEC.NENR.MA | School enrollment, secondary, male (% net)
+SE.TER.ENRR | School enrollment, tertiary (% gross)
+SE.TER.ENRR.FE | School enrollment, tertiary, female (% gross)
+SE.XPD.TOTL.GD.ZS | Government expenditure on education, total (% of GDP)
+SH.ANM.CHLD.ZS | Prevalence of anemia among children (% of children under 5)
+SH.ANM.NPRG.ZS | Prevalence of anemia among non-pregnant women (% of women ages 15-49)
+SH.CON.1524.FE.ZS | Condom use, population ages 15-24, female (% of females ages 15-24)
+SH.CON.1524.MA.ZS | Condom use, population ages 15-24, male (% of males ages 15-24)
+SH.CON.AIDS.FE.ZS | Condom use at last high-risk sex, adult female (% ages 15-49)
+SH.CON.AIDS.MA.ZS | Condom use at last high-risk sex, adult male (% ages 15-49)
+SH.DTH.COMM.ZS | Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)
+SH.DTH.IMRT | Number of infant deaths
+SH.DTH.INJR.ZS | Cause of death, by injury (% of total)
+SH.DTH.MORT | Number of under-five deaths
+SH.DTH.NCOM.ZS | Cause of death, by non-communicable diseases (% of total)
+SH.DTH.NMRT | Number of neonatal deaths
+SH.DYN.AIDS | Adults (ages 15+) living with HIV
+SH.DYN.AIDS.DH | AIDS estimated deaths (UNAIDS estimates)
+SH.DYN.AIDS.FE.ZS | Women's share of population ages 15+ living with HIV (%)
+SH.DYN.AIDS.ZS | Prevalence of HIV, total (% of population ages 15-49)
+SH.DYN.MORT | Mortality rate, under-5 (per 1,000 live births)
+SH.DYN.MORT.FE | Mortality rate, under-5, female (per 1,000 live births)
+SH.DYN.MORT.MA | Mortality rate, under-5, male (per 1,000 live births)
+SH.DYN.NMRT | Mortality rate, neonatal (per 1,000 live births)
+SH.FPL.SATI.ZS | Met need for contraception (% of married women ages 15-49)
+SH.H2O.SAFE.RU.ZS | Improved water source, rural (% of rural population with access)
+SH.H2O.SAFE.UR.ZS | Improved water source, urban (% of urban population with access)
+SH.H2O.SAFE.ZS | Improved water source (% of population with access)
+SH.HIV.0014 | Children (0-14) living with HIV
+SH.HIV.1524.FE.ZS | Prevalence of HIV, female (% ages 15-24)
+SH.HIV.1524.KW.FE.ZS | Comprehensive correct knowledge of HIV/AIDS, ages 15-24, female (2 prevent ways and reject 3 misconceptions)
+SH.HIV.1524.KW.MA.ZS | Comprehensive correct knowledge of HIV/AIDS, ages 15-24, male (2 prevent ways and reject 3 misconceptions)
+SH.HIV.1524.MA.ZS | Prevalence of HIV, male (% ages 15-24)
+SH.HIV.ARTC.ZS | Antiretroviral therapy coverage (% of people living with HIV)
+SH.HIV.KNOW.FE.ZS | % of females ages 15-49 having comprehensive correct knowledge about HIV (2 prevent ways and reject 3 misconceptions)
+SH.HIV.KNOW.MA.ZS | % of males ages 15-49 having comprehensive correct knowledge about HIV (2 prevent ways and reject 3 misconceptions)
+SH.HIV.ORPH | Children orphaned by HIV/AIDS
+SH.HIV.TOTL | Adults (ages 15+) and children (0-14 years) living with HIV
+SH.IMM.HEPB | Immunization, HepB3 (% of one-year-old children)
+SH.IMM.HIB3 | Immunization, Hib3 (% of children ages 12-23 months)
+SH.IMM.IBCG | Immunization, BCG (% of one-year-old children)
+SH.IMM.IDPT | Immunization, DPT (% of children ages 12-23 months)
+SH.IMM.MEAS | Immunization, measles (% of children ages 12-23 months)
+SH.IMM.POL3 | Immunization, Pol3 (% of one-year-old children)
+SH.MED.BEDS.ZS | Hospital beds (per 1,000 people)
+SH.MED.CMHW.P3 | Community health workers (per 1,000 people)
+SH.MED.NUMW.P3 | Nurses and midwives (per 1,000 people)
+SH.MED.PHYS.ZS | Physicians (per 1,000 people)
+SH.MLR.NETS.ZS | Use of insecticide-treated bed nets (% of under-5 population)
+SH.MLR.PREG.ZS | Use of any antimalarial drug (% of pregnant women)
+SH.MLR.SPF2.ZS | Use of Intermittent Preventive Treatment of malaria, 2+ doses of SP/Fansidar (% of pregnant women)
+SH.MLR.TRET.ZS | Children with fever receiving antimalarial drugs (% of children under age 5 with fever)
+SH.MMR.DTHS | Number of maternal deaths
+SH.MMR.LEVE | Number of weeks of maternity leave
+SH.MMR.RISK | Lifetime risk of maternal death (1 in: rate varies by country)
+SH.MMR.RISK.ZS | Lifetime risk of maternal death (%)
+SH.MMR.WAGE.ZS | Maternal leave benefits (% of wages paid in covered period)
+SH.PRG.ANEM | Prevalence of anemia among pregnant women (%)
+SH.PRG.ARTC.ZS | Antiretroviral therapy coverage (% of pregnant women living with HIV)
+SH.PRG.SYPH.ZS | Prevalence of syphilis (% of women attending antenatal care)
+SH.PRV.SMOK.FE | Smoking prevalence, females (% of adults)
+SH.PRV.SMOK.MA | Smoking prevalence, males (% of adults)
+SH.STA.ACSN | Improved sanitation facilities (% of population with access)
+SH.STA.ACSN.RU | Improved sanitation facilities, rural (% of rural population with access)
+SH.STA.ACSN.UR | Improved sanitation facilities, urban (% of urban population with access)
+SH.STA.ANV4.ZS | Pregnant women receiving prenatal care of at least four visits (% of pregnant women)
+SH.STA.ANVC.ZS | Pregnant women receiving prenatal care (%)
+SH.STA.ARIC.ZS | ARI treatment (% of children under 5 taken to a health provider)
+SH.STA.BFED.ZS | Exclusive breastfeeding (% of children under 6 months)
+SH.STA.BRTC.ZS | Births attended by skilled health staff (% of total)
+SH.STA.BRTW.ZS | Low-birthweight babies (% of births)
+SH.STA.DIAB.ZS | Diabetes prevalence (% of population ages 20 to 79)
+SH.STA.IYCF.ZS | Infant and young child feeding practices, all 3 IYCF (% children ages 6-23 months)
+SH.STA.MALN.FE.ZS | Prevalence of underweight, weight for age, female (% of children under 5)
+SH.STA.MALN.MA.ZS | Prevalence of underweight, weight for age, male (% of children under 5)
+SH.STA.MALN.ZS | Prevalence of underweight, weight for age (% of children under 5)
+SH.STA.MALR | Malaria cases reported
+SH.STA.MMRT | Maternal mortality ratio (modeled estimate, per 100,000 live births)
+SH.STA.MMRT.NE | Maternal mortality ratio (national estimate, per 100,000 live births)
+SH.STA.ORCF.ZS | Diarrhea treatment (% of children under 5 receiving oral rehydration and continued feeding)
+SH.STA.ORTH | Diarrhea treatment (% of children under 5 who received ORS packet)
+SH.STA.OW15.FE.ZS | Prevalence of overweight, female (% of female adults)
+SH.STA.OW15.MA.ZS | Prevalence of overweight, male (% of male adults)
+SH.STA.OW15.ZS | Prevalence of overweight (% of adults)
+SH.STA.OWGH.FE.ZS | Prevalence of overweight, weight for height, female (% of children under 5)
+SH.STA.OWGH.MA.ZS | Prevalence of overweight, weight for height, male (% of children under 5)
+SH.STA.OWGH.ZS | Prevalence of overweight, weight for height (% of children under 5)
+SH.STA.PNVC.ZS | Postnatal care coverage (% mothers)
+SH.STA.STNT.FE.ZS | Prevalence of stunting, height for age, female (% of children under 5)
+SH.STA.STNT.MA.ZS | Prevalence of stunting, height for age, male (% of children under 5)
+SH.STA.STNT.ZS | Prevalence of stunting, height for age (% of children under 5)
+SH.STA.WAST.FE.ZS | Prevalence of wasting, weight for height, female (% of children under 5)
+SH.STA.WAST.MA.ZS | Prevalence of wasting, weight for height, male (% of children under 5)
+SH.STA.WAST.ZS | Prevalence of wasting, weight for height (% of children under 5)
+SH.SVR.WAST.FE.ZS | Prevalence of severe wasting, weight for height, female (% of children under 5)
+SH.SVR.WAST.MA.ZS | Prevalence of severe wasting, weight for height, male (% of children under 5)
+SH.SVR.WAST.ZS | Prevalence of severe wasting, weight for height (% of children under 5)
+SH.TBS.CURE.ZS | Tuberculosis treatment success rate (% of new cases)
+SH.TBS.DTEC.ZS | Tuberculosis case detection rate (%, all forms)
+SH.TBS.INCD | Incidence of tuberculosis (per 100,000 people)
+SH.TBS.MORT | Tuberculosis death rate (per 100,000 people)
+SH.TBS.PREV | Prevalence of tuberculosis (per 100,000 population)
+SH.VAC.TTNS.ZS | Newborns protected against tetanus (%)
+SH.XPD.EXTR.ZS | External resources for health (% of total expenditure on health)
+SH.XPD.OOPC.TO.ZS | Out-of-pocket health expenditure (% of total expenditure on health)
+SH.XPD.OOPC.ZS | Out-of-pocket health expenditure (% of private expenditure on health)
+SH.XPD.PCAP | Health expenditure per capita (current US$)
+SH.XPD.PCAP.PP.KD | Health expenditure per capita, PPP (constant 2011 international $)
+SH.XPD.PRIV | Health expenditure, private (% of total health expenditure)
+SH.XPD.PRIV.ZS | Health expenditure, private (% of GDP)
+SH.XPD.PUBL | Health expenditure, public (% of total health expenditure)
+SH.XPD.PUBL.GX.ZS | Health expenditure, public (% of government expenditure)
+SH.XPD.PUBL.ZS | Health expenditure, public (% of GDP)
+SH.XPD.TOTL.CD | Health expenditure, total (current US$)
+SH.XPD.TOTL.ZS | Health expenditure, total (% of GDP)
+SI.POV.NAHC | Poverty headcount ratio at national poverty lines (% of population)
+SI.POV.RUHC | Rural poverty headcount ratio at national poverty lines (% of rural population)
+SI.POV.URHC | Urban poverty headcount ratio at national poverty lines (% of urban population)
+SL.EMP.INSV.FE.ZS | Share of women in wage employment in the nonagricultural sector (% of total nonagricultural employment)
+SL.TLF.TOTL.FE.ZS | Labor force, female (% of total labor force)
+SL.TLF.TOTL.IN | Labor force, total
+SL.UEM.TOTL.FE.ZS | Unemployment, female (% of female labor force) (modeled ILO estimate)
+SL.UEM.TOTL.MA.ZS | Unemployment, male (% of male labor force) (modeled ILO estimate)
+SL.UEM.TOTL.ZS | Unemployment, total (% of total labor force) (modeled ILO estimate)
+SM.POP.NETM | Net migration
+SN.ITK.DEFC | Number of people who are undernourished
+SN.ITK.DEFC.ZS | Prevalence of undernourishment (% of population)
+SN.ITK.SALT.ZS | Consumption of iodized salt (% of households)
+SN.ITK.VITA.ZS | Vitamin A supplementation coverage rate (% of children ages 6-59 months)
+SP.ADO.TFRT | Adolescent fertility rate (births per 1,000 women ages 15-19)
+SP.DYN.AMRT.FE | Mortality rate, adult, female (per 1,000 female adults)
+SP.DYN.AMRT.MA | Mortality rate, adult, male (per 1,000 male adults)
+SP.DYN.CBRT.IN | Birth rate, crude (per 1,000 people)
+SP.DYN.CDRT.IN | Death rate, crude (per 1,000 people)
+SP.DYN.CONU.ZS | Contraceptive prevalence (% of women ages 15-49)
+SP.DYN.IMRT.FE.IN | Mortality rate, infant, female (per 1,000 live births)
+SP.DYN.IMRT.IN | Mortality rate, infant (per 1,000 live births)
+SP.DYN.IMRT.MA.IN | Mortality rate, infant, male (per 1,000 live births)
+SP.DYN.LE00.FE.IN | Life expectancy at birth, female (years)
+SP.DYN.LE00.IN | Life expectancy at birth, total (years)
+SP.DYN.LE00.MA.IN | Life expectancy at birth, male (years)
+SP.DYN.SMAM.FE | Mean age at first marriage, female
+SP.DYN.SMAM.MA | Mean age at first marriage, male
+SP.DYN.TFRT.IN | Fertility rate, total (births per woman)
+SP.DYN.TO65.FE.ZS | Survival to age 65, female (% of cohort)
+SP.DYN.TO65.MA.ZS | Survival to age 65, male (% of cohort)
+SP.DYN.WFRT | Wanted fertility rate (births per woman)
+SP.HOU.FEMA.ZS | Female headed households (% of households with a female head)
+SP.MTR.1519.ZS | Teenage mothers (% of women ages 15-19 who have had children or are currently pregnant)
+SP.POP.0004.FE | Population ages 0-4, female
+SP.POP.0004.FE.5Y | Population ages 0-4, female (% of female population)
+SP.POP.0004.MA | Population ages 0-4, male
+SP.POP.0004.MA.5Y | Population ages 0-4, male (% of male population)
+SP.POP.0014.FE.ZS | Population ages 0-14, female (% of total)
+SP.POP.0014.MA.ZS | Population ages 0-14, male (% of total)
+SP.POP.0014.TO | Population ages 0-14, total
+SP.POP.0014.TO.ZS | Population ages 0-14 (% of total)
+SP.POP.0509.FE | Population ages 5-9, female
+SP.POP.0509.FE.5Y | Population ages 5-9, female (% of female population)
+SP.POP.0509.MA | Population ages 5-9, male
+SP.POP.0509.MA.5Y | Population ages 5-9, male (% of male population)
+SP.POP.1014.FE | Population ages 10-14, female
+SP.POP.1014.FE.5Y | Population ages 10-14, female (% of female population)
+SP.POP.1014.MA | Population ages 10-14, male
+SP.POP.1014.MA.5Y | Population ages 10-14, male (% of male population)
+SP.POP.1519.FE | Population ages 15-19, female
+SP.POP.1519.FE.5Y | Population ages 15-19, female (% of female population)
+SP.POP.1519.MA | Population ages 15-19, male
+SP.POP.1519.MA.5Y | Population ages 15-19, male (% of male population)
+SP.POP.1564.FE.ZS | Population ages 15-64, female (% of total)
+SP.POP.1564.MA.ZS | Population ages 15-64, male (% of total)
+SP.POP.1564.TO | Population ages 15-64, total
+SP.POP.1564.TO.ZS | Population ages 15-64 (% of total)
+SP.POP.2024.FE | Population ages 20-24, female
+SP.POP.2024.FE.5Y | Population ages 20-24, female (% of female population)
+SP.POP.2024.MA | Population ages 20-24, male
+SP.POP.2024.MA.5Y | Population ages 20-24, male (% of male population)
+SP.POP.2529.FE | Population ages 25-29, female
+SP.POP.2529.FE.5Y | Population ages 25-29, female (% of female population)
+SP.POP.2529.MA | Population ages 25-29, male
+SP.POP.2529.MA.5Y | Population ages 25-29, male (% of male population)
+SP.POP.3034.FE | Population ages 30-34, female
+SP.POP.3034.FE.5Y | Population ages 30-34, female (% of female population)
+SP.POP.3034.MA | Population ages 30-34, male
+SP.POP.3034.MA.5Y | Population ages 30-34, male (% of male population)
+SP.POP.3539.FE | Population ages 35-39, female
+SP.POP.3539.FE.5Y | Population ages 35-39, female (% of female population)
+SP.POP.3539.MA | Population ages 35-39, male
+SP.POP.3539.MA.5Y | Population ages 35-39, male (% of male population)
+SP.POP.4044.FE | Population ages 40-44, female
+SP.POP.4044.FE.5Y | Population ages 40-44, female (% of female population)
+SP.POP.4044.MA | Population ages 40-44, male
+SP.POP.4044.MA.5Y | Population ages 40-44, male (% of male population)
+SP.POP.4549.FE | Population ages 45-49, female
+SP.POP.4549.FE.5Y | Population ages 45-49, female (% of female population)
+SP.POP.4549.MA | Population ages 45-49, male
+SP.POP.4549.MA.5Y | Population ages 45-49, male (% of male population)
+SP.POP.5054.FE | Population ages 50-54, female
+SP.POP.5054.FE.5Y | Population ages 50-54, female (% of female population)
+SP.POP.5054.MA | Population ages 50-54, male
+SP.POP.5054.MA.5Y | Population ages 50-54, male (% of male population)
+SP.POP.5559.FE | Population ages 55-59, female
+SP.POP.5559.FE.5Y | Population ages 55-59, female (% of female population)
+SP.POP.5559.MA | Population ages 55-59, male
+SP.POP.5559.MA.5Y | Population ages 55-59, male (% of male population)
+SP.POP.6064.FE | Population ages 60-64, female
+SP.POP.6064.FE.5Y | Population ages 60-64, female (% of female population)
+SP.POP.6064.MA | Population ages 60-64, male
+SP.POP.6064.MA.5Y | Population ages 60-64, male (% of male population)
+SP.POP.6569.FE | Population ages 65-69, female
+SP.POP.6569.FE.5Y | Population ages 65-69, female (% of female population)
+SP.POP.6569.MA | Population ages 65-69, male
+SP.POP.6569.MA.5Y | Population ages 65-69, male (% of male population)
+SP.POP.65UP.FE.ZS | Population ages 65 and above, female (% of total)
+SP.POP.65UP.MA.ZS | Population ages 65 and above, male (% of total)
+SP.POP.65UP.TO | Population ages 65 and above, total
+SP.POP.65UP.TO.ZS | Population ages 65 and above (% of total)
+SP.POP.7074.FE | Population ages 70-74, female
+SP.POP.7074.FE.5Y | Population ages 70-74, female (% of female population)
+SP.POP.7074.MA | Population ages 70-74, male
+SP.POP.7074.MA.5Y | Population ages 70-74, male (% of male population)
+SP.POP.7579.FE | Population ages 75-79, female
+SP.POP.7579.FE.5Y | Population ages 75-79, female (% of female population)
+SP.POP.7579.MA | Population ages 75-79, male
+SP.POP.7579.MA.5Y | Population ages 75-79, male (% of male population)
+SP.POP.80UP.FE | Population ages 80 and above, female
+SP.POP.80UP.FE.5Y | Population ages 80 and above, female (% of female population)
+SP.POP.80UP.MA | Population ages 80 and above, male
+SP.POP.80UP.MA.5Y | Population ages 80 and above, male (% of male population)
+SP.POP.AG00.FE.IN | Age population, age 0, female, interpolated
+SP.POP.AG00.MA.IN | Age population, age 0, male, interpolated
+SP.POP.AG01.FE.IN | Age population, age 01, female, interpolated
+SP.POP.AG01.MA.IN | Age population, age 01, male, interpolated
+SP.POP.AG02.FE.IN | Age population, age 02, female, interpolated
+SP.POP.AG02.MA.IN | Age population, age 02, male, interpolated
+SP.POP.AG03.FE.IN | Age population, age 03, female, interpolated
+SP.POP.AG03.MA.IN | Age population, age 03, male, interpolated
+SP.POP.AG04.FE.IN | Age population, age 04, female, interpolated
+SP.POP.AG04.MA.IN | Age population, age 04, male, interpolated
+SP.POP.AG05.FE.IN | Age population, age 05, female, interpolated
+SP.POP.AG05.MA.IN | Age population, age 05, male, interpolated
+SP.POP.AG06.FE.IN | Age population, age 06, female, interpolated
+SP.POP.AG06.MA.IN | Age population, age 06, male, interpolated
+SP.POP.AG07.FE.IN | Age population, age 07, female, interpolated
+SP.POP.AG07.MA.IN | Age population, age 07, male, interpolated
+SP.POP.AG08.FE.IN | Age population, age 08, female, interpolated
+SP.POP.AG08.MA.IN | Age population, age 08, male, interpolated
+SP.POP.AG09.FE.IN | Age population, age 09, female, interpolated
+SP.POP.AG09.MA.IN | Age population, age 09, male, interpolated
+SP.POP.AG10.FE.IN | Age population, age 10, female, interpolated
+SP.POP.AG10.MA.IN | Age population, age 10, male
+SP.POP.AG11.FE.IN | Age population, age 11, female, interpolated
+SP.POP.AG11.MA.IN | Age population, age 11, male
+SP.POP.AG12.FE.IN | Age population, age 12, female, interpolated
+SP.POP.AG12.MA.IN | Age population, age 12, male
+SP.POP.AG13.FE.IN | Age population, age 13, female, interpolated
+SP.POP.AG13.MA.IN | Age population, age 13, male
+SP.POP.AG14.FE.IN | Age population, age 14, female, interpolated
+SP.POP.AG14.MA.IN | Age population, age 14, male
+SP.POP.AG15.FE.IN | Age population, age 15, female, interpolated
+SP.POP.AG15.MA.IN | Age population, age 15, male, interpolated
+SP.POP.AG16.FE.IN | Age population, age 16, female, interpolated
+SP.POP.AG16.MA.IN | Age population, age 16, male, interpolated
+SP.POP.AG17.FE.IN | Age population, age 17, female, interpolated
+SP.POP.AG17.MA.IN | Age population, age 17, male, interpolated
+SP.POP.AG18.FE.IN | Age population, age 18, female, interpolated
+SP.POP.AG18.MA.IN | Age population, age 18, male, interpolated
+SP.POP.AG19.FE.IN | Age population, age 19, female, interpolated
+SP.POP.AG19.MA.IN | Age population, age 19, male, interpolated
+SP.POP.AG20.FE.IN | Age population, age 20, female, interpolated
+SP.POP.AG20.MA.IN | Age population, age 20, male, interpolated
+SP.POP.AG21.FE.IN | Age population, age 21, female, interpolated
+SP.POP.AG21.MA.IN | Age population, age 21, male, interpolated
+SP.POP.AG22.FE.IN | Age population, age 22, female, interpolated
+SP.POP.AG22.MA.IN | Age population, age 22, male, interpolated
+SP.POP.AG23.FE.IN | Age population, age 23, female, interpolated
+SP.POP.AG23.MA.IN | Age population, age 23, male, interpolated
+SP.POP.AG24.FE.IN | Age population, age 24, female, interpolated
+SP.POP.AG24.MA.IN | Age population, age 24, male, interpolated
+SP.POP.AG25.FE.IN | Age population, age 25, female, interpolated
+SP.POP.AG25.MA.IN | Age population, age 25, male, interpolated
+SP.POP.BRTH.MF | Sex ratio at birth (male births per female births)
+SP.POP.DPND | Age dependency ratio (% of working-age population)
+SP.POP.DPND.OL | Age dependency ratio, old (% of working-age population)
+SP.POP.DPND.YG | Age dependency ratio, young (% of working-age population)
+SP.POP.GROW | Population growth (annual %)
+SP.POP.TOTL | Population, total
+SP.POP.TOTL.FE.IN | Population, female
+SP.POP.TOTL.FE.ZS | Population, female (% of total)
+SP.POP.TOTL.MA.IN | Population, male
+SP.POP.TOTL.MA.ZS | Population, male (% of total)
+SP.REG.BRTH.RU.ZS | Completeness of birth registration, rural (%)
+SP.REG.BRTH.UR.ZS | Completeness of birth registration, urban (%)
+SP.REG.BRTH.ZS | Completeness of birth registration (%)
+SP.REG.DTHS.ZS | Completeness of death registration with cause-of-death information (%)
+SP.RUR.TOTL | Rural population
+SP.RUR.TOTL.ZG | Rural population growth (annual %)
+SP.RUR.TOTL.ZS | Rural population (% of total population)
+SP.URB.GROW | Urban population growth (annual %)
+SP.URB.TOTL | Urban population
+SP.URB.TOTL.IN.ZS | Urban population (% of total)
+SP.UWT.TFRT | Unmet need for contraception (% of married women ages 15-49)
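
The block above is a flat listing of World Bank indicator codes, one "CODE | Description" pair per line. A minimal sketch of how such a listing could be loaded into a code-to-description lookup is given below; the file name "indicators.txt" and the helper name "load_indicators" are illustrative assumptions, not names taken from this commit.

    from typing import Dict


    def load_indicators(path: str = "indicators.txt") -> Dict[str, str]:
        # Illustrative sketch: parse "CODE | Description" rows into a dict.
        # The path is a placeholder; point it at wherever the listing is saved.
        indicators: Dict[str, str] = {}
        with open(path, encoding="utf-8") as handle:
            for line in handle:
                line = line.strip()
                if not line or "|" not in line:
                    continue  # skip blank or malformed rows
                code, description = line.split("|", 1)
                indicators[code.strip()] = description.strip()
        return indicators


    # Example lookup, based on the rows above:
    # load_indicators()["SP.POP.TOTL"] -> "Population, total"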

+ 2505 - 0
data/purposeCombined/BI/examples/countries.py

@@ -0,0 +1,2505 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""This module contains data related to countries and is used for geo mapping"""
+from typing import Any, Dict, List, Optional
+
+countries: List[Dict[str, Any]] = [
+    {
+        "name": "Angola",
+        "area": 1246700,
+        "cioc": "ANG",
+        "cca2": "AO",
+        "capital": "Luanda",
+        "lat": -12.5,
+        "lng": 18.5,
+        "cca3": "AGO",
+    },
+    {
+        "name": "Algeria",
+        "area": 2381741,
+        "cioc": "ALG",
+        "cca2": "DZ",
+        "capital": "Algiers",
+        "lat": 28,
+        "lng": 3,
+        "cca3": "DZA",
+    },
+    {
+        "name": "Egypt",
+        "area": 1002450,
+        "cioc": "EGY",
+        "cca2": "EG",
+        "capital": "Cairo",
+        "lat": 27,
+        "lng": 30,
+        "cca3": "EGY",
+    },
+    {
+        "name": "Bangladesh",
+        "area": 147570,
+        "cioc": "BAN",
+        "cca2": "BD",
+        "capital": "Dhaka",
+        "lat": 24,
+        "lng": 90,
+        "cca3": "BGD",
+    },
+    {
+        "name": "Niger",
+        "area": 1267000,
+        "cioc": "NIG",
+        "cca2": "NE",
+        "capital": "Niamey",
+        "lat": 16,
+        "lng": 8,
+        "cca3": "NER",
+    },
+    {
+        "name": "Liechtenstein",
+        "area": 160,
+        "cioc": "LIE",
+        "cca2": "LI",
+        "capital": "Vaduz",
+        "lat": 47.26666666,
+        "lng": 9.53333333,
+        "cca3": "LIE",
+    },
+    {
+        "name": "Namibia",
+        "area": 825615,
+        "cioc": "NAM",
+        "cca2": "NA",
+        "capital": "Windhoek",
+        "lat": -22,
+        "lng": 17,
+        "cca3": "NAM",
+    },
+    {
+        "name": "Bulgaria",
+        "area": 110879,
+        "cioc": "BUL",
+        "cca2": "BG",
+        "capital": "Sofia",
+        "lat": 43,
+        "lng": 25,
+        "cca3": "BGR",
+    },
+    {
+        "name": "Bolivia",
+        "area": 1098581,
+        "cioc": "BOL",
+        "cca2": "BO",
+        "capital": "Sucre",
+        "lat": -17,
+        "lng": -65,
+        "cca3": "BOL",
+    },
+    {
+        "name": "Ghana",
+        "area": 238533,
+        "cioc": "GHA",
+        "cca2": "GH",
+        "capital": "Accra",
+        "lat": 8,
+        "lng": -2,
+        "cca3": "GHA",
+    },
+    {
+        "name": "Cocos (Keeling) Islands",
+        "area": 14,
+        "cioc": "",
+        "cca2": "CC",
+        "capital": "West Island",
+        "lat": -12.5,
+        "lng": 96.83333333,
+        "cca3": "CCK",
+    },
+    {
+        "name": "Pakistan",
+        "area": 881912,
+        "cioc": "PAK",
+        "cca2": "PK",
+        "capital": "Islamabad",
+        "lat": 30,
+        "lng": 70,
+        "cca3": "PAK",
+    },
+    {
+        "name": "Cape Verde",
+        "area": 4033,
+        "cioc": "CPV",
+        "cca2": "CV",
+        "capital": "Praia",
+        "lat": 16,
+        "lng": -24,
+        "cca3": "CPV",
+    },
+    {
+        "name": "Jordan",
+        "area": 89342,
+        "cioc": "JOR",
+        "cca2": "JO",
+        "capital": "Amman",
+        "lat": 31,
+        "lng": 36,
+        "cca3": "JOR",
+    },
+    {
+        "name": "Liberia",
+        "area": 111369,
+        "cioc": "LBR",
+        "cca2": "LR",
+        "capital": "Monrovia",
+        "lat": 6.5,
+        "lng": -9.5,
+        "cca3": "LBR",
+    },
+    {
+        "name": "Libya",
+        "area": 1759540,
+        "cioc": "LBA",
+        "cca2": "LY",
+        "capital": "Tripoli",
+        "lat": 25,
+        "lng": 17,
+        "cca3": "LBY",
+    },
+    {
+        "name": "Malaysia",
+        "area": 330803,
+        "cioc": "MAS",
+        "cca2": "MY",
+        "capital": "Kuala Lumpur",
+        "lat": 2.5,
+        "lng": 112.5,
+        "cca3": "MYS",
+    },
+    {
+        "name": "Dominican Republic",
+        "area": 48671,
+        "cioc": "DOM",
+        "cca2": "DO",
+        "capital": "Santo Domingo",
+        "lat": 19,
+        "lng": -70.66666666,
+        "cca3": "DOM",
+    },
+    {
+        "name": "Puerto Rico",
+        "area": 8870,
+        "cioc": "PUR",
+        "cca2": "PR",
+        "capital": "San Juan",
+        "lat": 18.25,
+        "lng": -66.5,
+        "cca3": "PRI",
+    },
+    {
+        "name": "Mayotte",
+        "area": 374,
+        "cioc": "",
+        "cca2": "YT",
+        "capital": "Mamoudzou",
+        "lat": -12.83333333,
+        "lng": 45.16666666,
+        "cca3": "MYT",
+    },
+    {
+        "name": "North Korea",
+        "area": 120538,
+        "cioc": "PRK",
+        "cca2": "KP",
+        "capital": "Pyongyang",
+        "lat": 40,
+        "lng": 127,
+        "cca3": "PRK",
+    },
+    {
+        "name": "Palestine",
+        "area": 6220,
+        "cioc": "PLE",
+        "cca2": "PS",
+        "capital": "Ramallah",
+        "lat": 31.9,
+        "lng": 35.2,
+        "cca3": "PSE",
+    },
+    {
+        "name": "Tanzania",
+        "area": 945087,
+        "cioc": "TAN",
+        "cca2": "TZ",
+        "capital": "Dodoma",
+        "lat": -6,
+        "lng": 35,
+        "cca3": "TZA",
+    },
+    {
+        "name": "Botswana",
+        "area": 582000,
+        "cioc": "BOT",
+        "cca2": "BW",
+        "capital": "Gaborone",
+        "lat": -22,
+        "lng": 24,
+        "cca3": "BWA",
+    },
+    {
+        "name": "Cambodia",
+        "area": 181035,
+        "cioc": "CAM",
+        "cca2": "KH",
+        "capital": "Phnom Penh",
+        "lat": 13,
+        "lng": 105,
+        "cca3": "KHM",
+    },
+    {
+        "name": "Nicaragua",
+        "area": 130373,
+        "cioc": "NCA",
+        "cca2": "NI",
+        "capital": "Managua",
+        "lat": 13,
+        "lng": -85,
+        "cca3": "NIC",
+    },
+    {
+        "name": "Trinidad and Tobago",
+        "area": 5130,
+        "cioc": "TTO",
+        "cca2": "TT",
+        "capital": "Port of Spain",
+        "lat": 11,
+        "lng": -61,
+        "cca3": "TTO",
+    },
+    {
+        "name": "Ethiopia",
+        "area": 1104300,
+        "cioc": "ETH",
+        "cca2": "ET",
+        "capital": "Addis Ababa",
+        "lat": 8,
+        "lng": 38,
+        "cca3": "ETH",
+    },
+    {
+        "name": "Paraguay",
+        "area": 406752,
+        "cioc": "PAR",
+        "cca2": "PY",
+        "capital": "Asuncion",
+        "lat": -23,
+        "lng": -58,
+        "cca3": "PRY",
+    },
+    {
+        "name": "Hong Kong",
+        "area": 1104,
+        "cioc": "HKG",
+        "cca2": "HK",
+        "capital": "City of Victoria",
+        "lat": 22.267,
+        "lng": 114.188,
+        "cca3": "HKG",
+    },
+    {
+        "name": "Saudi Arabia",
+        "area": 2149690,
+        "cioc": "KSA",
+        "cca2": "SA",
+        "capital": "Riyadh",
+        "lat": 25,
+        "lng": 45,
+        "cca3": "SAU",
+    },
+    {
+        "name": "Lebanon",
+        "area": 10452,
+        "cioc": "LIB",
+        "cca2": "LB",
+        "capital": "Beirut",
+        "lat": 33.83333333,
+        "lng": 35.83333333,
+        "cca3": "LBN",
+    },
+    {
+        "name": "Slovenia",
+        "area": 20273,
+        "cioc": "SLO",
+        "cca2": "SI",
+        "capital": "Ljubljana",
+        "lat": 46.11666666,
+        "lng": 14.81666666,
+        "cca3": "SVN",
+    },
+    {
+        "name": "Burkina Faso",
+        "area": 272967,
+        "cioc": "BUR",
+        "cca2": "BF",
+        "capital": "Ouagadougou",
+        "lat": 13,
+        "lng": -2,
+        "cca3": "BFA",
+    },
+    {
+        "name": "Switzerland",
+        "area": 41284,
+        "cioc": "SUI",
+        "cca2": "CH",
+        "capital": "Bern",
+        "lat": 47,
+        "lng": 8,
+        "cca3": "CHE",
+    },
+    {
+        "name": "Mauritania",
+        "area": 1030700,
+        "cioc": "MTN",
+        "cca2": "MR",
+        "capital": "Nouakchott",
+        "lat": 20,
+        "lng": -12,
+        "cca3": "MRT",
+    },
+    {
+        "name": "Croatia",
+        "area": 56594,
+        "cioc": "CRO",
+        "cca2": "HR",
+        "capital": "Zagreb",
+        "lat": 45.16666666,
+        "lng": 15.5,
+        "cca3": "HRV",
+    },
+    {
+        "name": "Chile",
+        "area": 756102,
+        "cioc": "CHI",
+        "cca2": "CL",
+        "capital": "Santiago",
+        "lat": -30,
+        "lng": -71,
+        "cca3": "CHL",
+    },
+    {
+        "name": "China",
+        "area": 9706961,
+        "cioc": "CHN",
+        "cca2": "CN",
+        "capital": "Beijing",
+        "lat": 35,
+        "lng": 105,
+        "cca3": "CHN",
+    },
+    {
+        "name": "Saint Kitts and Nevis",
+        "area": 261,
+        "cioc": "SKN",
+        "cca2": "KN",
+        "capital": "Basseterre",
+        "lat": 17.33333333,
+        "lng": -62.75,
+        "cca3": "KNA",
+    },
+    {
+        "name": "Sierra Leone",
+        "area": 71740,
+        "cioc": "SLE",
+        "cca2": "SL",
+        "capital": "Freetown",
+        "lat": 8.5,
+        "lng": -11.5,
+        "cca3": "SLE",
+    },
+    {
+        "name": "Jamaica",
+        "area": 10991,
+        "cioc": "JAM",
+        "cca2": "JM",
+        "capital": "Kingston",
+        "lat": 18.25,
+        "lng": -77.5,
+        "cca3": "JAM",
+    },
+    {
+        "name": "San Marino",
+        "area": 61,
+        "cioc": "SMR",
+        "cca2": "SM",
+        "capital": "City of San Marino",
+        "lat": 43.76666666,
+        "lng": 12.41666666,
+        "cca3": "SMR",
+    },
+    {
+        "name": "Gibraltar",
+        "area": 6,
+        "cioc": "",
+        "cca2": "GI",
+        "capital": "Gibraltar",
+        "lat": 36.13333333,
+        "lng": -5.35,
+        "cca3": "GIB",
+    },
+    {
+        "name": "Djibouti",
+        "area": 23200,
+        "cioc": "DJI",
+        "cca2": "DJ",
+        "capital": "Djibouti",
+        "lat": 11.5,
+        "lng": 43,
+        "cca3": "DJI",
+    },
+    {
+        "name": "Guinea",
+        "area": 245857,
+        "cioc": "GUI",
+        "cca2": "GN",
+        "capital": "Conakry",
+        "lat": 11,
+        "lng": -10,
+        "cca3": "GIN",
+    },
+    {
+        "name": "Finland",
+        "area": 338424,
+        "cioc": "FIN",
+        "cca2": "FI",
+        "capital": "Helsinki",
+        "lat": 64,
+        "lng": 26,
+        "cca3": "FIN",
+    },
+    {
+        "name": "Uruguay",
+        "area": 181034,
+        "cioc": "URU",
+        "cca2": "UY",
+        "capital": "Montevideo",
+        "lat": -33,
+        "lng": -56,
+        "cca3": "URY",
+    },
+    {
+        "name": "Thailand",
+        "area": 513120,
+        "cioc": "THA",
+        "cca2": "TH",
+        "capital": "Bangkok",
+        "lat": 15,
+        "lng": 100,
+        "cca3": "THA",
+    },
+    {
+        "name": "Sao Tome and Principe",
+        "area": 964,
+        "cioc": "STP",
+        "cca2": "ST",
+        "capital": "Sao Tome",
+        "lat": 1,
+        "lng": 7,
+        "cca3": "STP",
+    },
+    {
+        "name": "Seychelles",
+        "area": 452,
+        "cioc": "SEY",
+        "cca2": "SC",
+        "capital": "Victoria",
+        "lat": -4.58333333,
+        "lng": 55.66666666,
+        "cca3": "SYC",
+    },
+    {
+        "name": "Nepal",
+        "area": 147181,
+        "cioc": "NEP",
+        "cca2": "NP",
+        "capital": "Kathmandu",
+        "lat": 28,
+        "lng": 84,
+        "cca3": "NPL",
+    },
+    {
+        "name": "Christmas Island",
+        "area": 135,
+        "cioc": "",
+        "cca2": "CX",
+        "capital": "Flying Fish Cove",
+        "lat": -10.5,
+        "lng": 105.66666666,
+        "cca3": "CXR",
+    },
+    {
+        "name": "Laos",
+        "area": 236800,
+        "cioc": "LAO",
+        "cca2": "LA",
+        "capital": "Vientiane",
+        "lat": 18,
+        "lng": 105,
+        "cca3": "LAO",
+    },
+    {
+        "name": "Yemen",
+        "area": 527968,
+        "cioc": "YEM",
+        "cca2": "YE",
+        "capital": "Sana'a",
+        "lat": 15,
+        "lng": 48,
+        "cca3": "YEM",
+    },
+    {
+        "name": "Bouvet Island",
+        "area": 49,
+        "cioc": "",
+        "cca2": "BV",
+        "capital": "",
+        "lat": -54.43333333,
+        "lng": 3.4,
+        "cca3": "BVT",
+    },
+    {
+        "name": "South Africa",
+        "area": 1221037,
+        "cioc": "RSA",
+        "cca2": "ZA",
+        "capital": "Pretoria",
+        "lat": -29,
+        "lng": 24,
+        "cca3": "ZAF",
+    },
+    {
+        "name": "Kiribati",
+        "area": 811,
+        "cioc": "KIR",
+        "cca2": "KI",
+        "capital": "South Tarawa",
+        "lat": 1.41666666,
+        "lng": 173,
+        "cca3": "KIR",
+    },
+    {
+        "name": "Philippines",
+        "area": 342353,
+        "cioc": "PHI",
+        "cca2": "PH",
+        "capital": "Manila",
+        "lat": 13,
+        "lng": 122,
+        "cca3": "PHL",
+    },
+    {
+        "name": "Sint Maarten",
+        "area": 34,
+        "cioc": "",
+        "cca2": "SX",
+        "capital": "Philipsburg",
+        "lat": 18.033333,
+        "lng": -63.05,
+        "cca3": "SXM",
+    },
+    {
+        "name": "Romania",
+        "area": 238391,
+        "cioc": "ROU",
+        "cca2": "RO",
+        "capital": "Bucharest",
+        "lat": 46,
+        "lng": 25,
+        "cca3": "ROU",
+    },
+    {
+        "name": "United States Virgin Islands",
+        "area": 347,
+        "cioc": "ISV",
+        "cca2": "VI",
+        "capital": "Charlotte Amalie",
+        "lat": 18.35,
+        "lng": -64.933333,
+        "cca3": "VIR",
+    },
+    {
+        "name": "Syria",
+        "area": 185180,
+        "cioc": "SYR",
+        "cca2": "SY",
+        "capital": "Damascus",
+        "lat": 35,
+        "lng": 38,
+        "cca3": "SYR",
+    },
+    {
+        "name": "Macau",
+        "area": 30,
+        "cioc": "",
+        "cca2": "MO",
+        "capital": "",
+        "lat": 22.16666666,
+        "lng": 113.55,
+        "cca3": "MAC",
+    },
+    {
+        "name": "Saint Martin",
+        "area": 53,
+        "cioc": "",
+        "cca2": "MF",
+        "capital": "Marigot",
+        "lat": 18.08333333,
+        "lng": -63.95,
+        "cca3": "MAF",
+    },
+    {
+        "name": "Malta",
+        "area": 316,
+        "cioc": "MLT",
+        "cca2": "MT",
+        "capital": "Valletta",
+        "lat": 35.83333333,
+        "lng": 14.58333333,
+        "cca3": "MLT",
+    },
+    {
+        "name": "Kazakhstan",
+        "area": 2724900,
+        "cioc": "KAZ",
+        "cca2": "KZ",
+        "capital": "Astana",
+        "lat": 48,
+        "lng": 68,
+        "cca3": "KAZ",
+    },
+    {
+        "name": "Turks and Caicos Islands",
+        "area": 948,
+        "cioc": "",
+        "cca2": "TC",
+        "capital": "Cockburn Town",
+        "lat": 21.75,
+        "lng": -71.58333333,
+        "cca3": "TCA",
+    },
+    {
+        "name": "French Polynesia",
+        "area": 4167,
+        "cioc": "",
+        "cca2": "PF",
+        "capital": "Papeete",
+        "lat": -15,
+        "lng": -140,
+        "cca3": "PYF",
+    },
+    {
+        "name": "Niue",
+        "area": 260,
+        "cioc": "",
+        "cca2": "NU",
+        "capital": "Alofi",
+        "lat": -19.03333333,
+        "lng": -169.86666666,
+        "cca3": "NIU",
+    },
+    {
+        "name": "Dominica",
+        "area": 751,
+        "cioc": "DMA",
+        "cca2": "DM",
+        "capital": "Roseau",
+        "lat": 15.41666666,
+        "lng": -61.33333333,
+        "cca3": "DMA",
+    },
+    {
+        "name": "Benin",
+        "area": 112622,
+        "cioc": "BEN",
+        "cca2": "BJ",
+        "capital": "Porto-Novo",
+        "lat": 9.5,
+        "lng": 2.25,
+        "cca3": "BEN",
+    },
+    {
+        "name": "French Guiana",
+        "area": 83534,
+        "cioc": "",
+        "cca2": "GF",
+        "capital": "Cayenne",
+        "lat": 4,
+        "lng": -53,
+        "cca3": "GUF",
+    },
+    {
+        "name": "Belgium",
+        "area": 30528,
+        "cioc": "BEL",
+        "cca2": "BE",
+        "capital": "Brussels",
+        "lat": 50.83333333,
+        "lng": 4,
+        "cca3": "BEL",
+    },
+    {
+        "name": "Montserrat",
+        "area": 102,
+        "cioc": "",
+        "cca2": "MS",
+        "capital": "Plymouth",
+        "lat": 16.75,
+        "lng": -62.2,
+        "cca3": "MSR",
+    },
+    {
+        "name": "Togo",
+        "area": 56785,
+        "cioc": "TOG",
+        "cca2": "TG",
+        "capital": "Lome",
+        "lat": 8,
+        "lng": 1.16666666,
+        "cca3": "TGO",
+    },
+    {
+        "name": "Germany",
+        "area": 357114,
+        "cioc": "GER",
+        "cca2": "DE",
+        "capital": "Berlin",
+        "lat": 51,
+        "lng": 9,
+        "cca3": "DEU",
+    },
+    {
+        "name": "Guam",
+        "area": 549,
+        "cioc": "GUM",
+        "cca2": "GU",
+        "capital": "Hagatna",
+        "lat": 13.46666666,
+        "lng": 144.78333333,
+        "cca3": "GUM",
+    },
+    {
+        "name": "Sri Lanka",
+        "area": 65610,
+        "cioc": "SRI",
+        "cca2": "LK",
+        "capital": "Colombo",
+        "lat": 7,
+        "lng": 81,
+        "cca3": "LKA",
+    },
+    {
+        "name": "South Sudan",
+        "area": 619745,
+        "cioc": "",
+        "cca2": "SS",
+        "capital": "Juba",
+        "lat": 7,
+        "lng": 30,
+        "cca3": "SSD",
+    },
+    {
+        "name": "Falkland Islands",
+        "area": 12173,
+        "cioc": "",
+        "cca2": "FK",
+        "capital": "Stanley",
+        "lat": -51.75,
+        "lng": -59,
+        "cca3": "FLK",
+    },
+    {
+        "name": "United Kingdom",
+        "area": 242900,
+        "cioc": "GBR",
+        "cca2": "GB",
+        "capital": "London",
+        "lat": 54,
+        "lng": -2,
+        "cca3": "GBR",
+    },
+    {
+        "name": "Guyana",
+        "area": 214969,
+        "cioc": "GUY",
+        "cca2": "GY",
+        "capital": "Georgetown",
+        "lat": 5,
+        "lng": -59,
+        "cca3": "GUY",
+    },
+    {
+        "name": "Costa Rica",
+        "area": 51100,
+        "cioc": "CRC",
+        "cca2": "CR",
+        "capital": "San Jose",
+        "lat": 10,
+        "lng": -84,
+        "cca3": "CRI",
+    },
+    {
+        "name": "Cameroon",
+        "area": 475442,
+        "cioc": "CMR",
+        "cca2": "CM",
+        "capital": "Yaounde",
+        "lat": 6,
+        "lng": 12,
+        "cca3": "CMR",
+    },
+    {
+        "name": "Morocco",
+        "area": 446550,
+        "cioc": "MAR",
+        "cca2": "MA",
+        "capital": "Rabat",
+        "lat": 32,
+        "lng": -5,
+        "cca3": "MAR",
+    },
+    {
+        "name": "Northern Mariana Islands",
+        "area": 464,
+        "cioc": "",
+        "cca2": "MP",
+        "capital": "Saipan",
+        "lat": 15.2,
+        "lng": 145.75,
+        "cca3": "MNP",
+    },
+    {
+        "name": "Lesotho",
+        "area": 30355,
+        "cioc": "LES",
+        "cca2": "LS",
+        "capital": "Maseru",
+        "lat": -29.5,
+        "lng": 28.5,
+        "cca3": "LSO",
+    },
+    {
+        "name": "Hungary",
+        "area": 93028,
+        "cioc": "HUN",
+        "cca2": "HU",
+        "capital": "Budapest",
+        "lat": 47,
+        "lng": 20,
+        "cca3": "HUN",
+    },
+    {
+        "name": "Turkmenistan",
+        "area": 488100,
+        "cioc": "TKM",
+        "cca2": "TM",
+        "capital": "Ashgabat",
+        "lat": 40,
+        "lng": 60,
+        "cca3": "TKM",
+    },
+    {
+        "name": "Suriname",
+        "area": 163820,
+        "cioc": "SUR",
+        "cca2": "SR",
+        "capital": "Paramaribo",
+        "lat": 4,
+        "lng": -56,
+        "cca3": "SUR",
+    },
+    {
+        "name": "Netherlands",
+        "area": 41850,
+        "cioc": "NED",
+        "cca2": "NL",
+        "capital": "Amsterdam",
+        "lat": 52.5,
+        "lng": 5.75,
+        "cca3": "NLD",
+    },
+    {
+        "name": "Bermuda",
+        "area": 54,
+        "cioc": "BER",
+        "cca2": "BM",
+        "capital": "Hamilton",
+        "lat": 32.33333333,
+        "lng": -64.75,
+        "cca3": "BMU",
+    },
+    {
+        "name": "Heard Island and McDonald Islands",
+        "area": 412,
+        "cioc": "",
+        "cca2": "HM",
+        "capital": "",
+        "lat": -53.1,
+        "lng": 72.51666666,
+        "cca3": "HMD",
+    },
+    {
+        "name": "Chad",
+        "area": 1284000,
+        "cioc": "CHA",
+        "cca2": "TD",
+        "capital": "N'Djamena",
+        "lat": 15,
+        "lng": 19,
+        "cca3": "TCD",
+    },
+    {
+        "name": "Georgia",
+        "area": 69700,
+        "cioc": "GEO",
+        "cca2": "GE",
+        "capital": "Tbilisi",
+        "lat": 42,
+        "lng": 43.5,
+        "cca3": "GEO",
+    },
+    {
+        "name": "Montenegro",
+        "area": 13812,
+        "cioc": "MNE",
+        "cca2": "ME",
+        "capital": "Podgorica",
+        "lat": 42.5,
+        "lng": 19.3,
+        "cca3": "MNE",
+    },
+    {
+        "name": "Mongolia",
+        "area": 1564110,
+        "cioc": "MGL",
+        "cca2": "MN",
+        "capital": "Ulan Bator",
+        "lat": 46,
+        "lng": 105,
+        "cca3": "MNG",
+    },
+    {
+        "name": "Marshall Islands",
+        "area": 181,
+        "cioc": "MHL",
+        "cca2": "MH",
+        "capital": "Majuro",
+        "lat": 9,
+        "lng": 168,
+        "cca3": "MHL",
+    },
+    {
+        "name": "Martinique",
+        "area": 1128,
+        "cioc": "",
+        "cca2": "MQ",
+        "capital": "Fort-de-France",
+        "lat": 14.666667,
+        "lng": -61,
+        "cca3": "MTQ",
+    },
+    {
+        "name": "Belize",
+        "area": 22966,
+        "cioc": "BIZ",
+        "cca2": "BZ",
+        "capital": "Belmopan",
+        "lat": 17.25,
+        "lng": -88.75,
+        "cca3": "BLZ",
+    },
+    {
+        "name": "Norfolk Island",
+        "area": 36,
+        "cioc": "",
+        "cca2": "NF",
+        "capital": "Kingston",
+        "lat": -29.03333333,
+        "lng": 167.95,
+        "cca3": "NFK",
+    },
+    {
+        "name": "Myanmar",
+        "area": 676578,
+        "cioc": "MYA",
+        "cca2": "MM",
+        "capital": "Naypyidaw",
+        "lat": 22,
+        "lng": 98,
+        "cca3": "MMR",
+    },
+    {
+        "name": "Afghanistan",
+        "area": 652230,
+        "cioc": "AFG",
+        "cca2": "AF",
+        "capital": "Kabul",
+        "lat": 33,
+        "lng": 65,
+        "cca3": "AFG",
+    },
+    {
+        "name": "Burundi",
+        "area": 27834,
+        "cioc": "BDI",
+        "cca2": "BI",
+        "capital": "Bujumbura",
+        "lat": -3.5,
+        "lng": 30,
+        "cca3": "BDI",
+    },
+    {
+        "name": "British Virgin Islands",
+        "area": 151,
+        "cioc": "IVB",
+        "cca2": "VG",
+        "capital": "Road Town",
+        "lat": 18.431383,
+        "lng": -64.62305,
+        "cca3": "VGB",
+    },
+    {
+        "name": "Belarus",
+        "area": 207600,
+        "cioc": "BLR",
+        "cca2": "BY",
+        "capital": "Minsk",
+        "lat": 53,
+        "lng": 28,
+        "cca3": "BLR",
+    },
+    {
+        "name": "Saint Barthelemy",
+        "area": 21,
+        "cioc": "",
+        "cca2": "BL",
+        "capital": "Gustavia",
+        "lat": 18.5,
+        "lng": -63.41666666,
+        "cca3": "BLM",
+    },
+    {
+        "name": "Grenada",
+        "area": 344,
+        "cioc": "GRN",
+        "cca2": "GD",
+        "capital": "St. George's",
+        "lat": 12.11666666,
+        "lng": -61.66666666,
+        "cca3": "GRD",
+    },
+    {
+        "name": "Tokelau",
+        "area": 12,
+        "cioc": "",
+        "cca2": "TK",
+        "capital": "Fakaofo",
+        "lat": -9,
+        "lng": -172,
+        "cca3": "TKL",
+    },
+    {
+        "name": "Greece",
+        "area": 131990,
+        "cioc": "GRE",
+        "cca2": "GR",
+        "capital": "Athens",
+        "lat": 39,
+        "lng": 22,
+        "cca3": "GRC",
+    },
+    {
+        "name": "Russia",
+        "area": 17098242,
+        "cioc": "RUS",
+        "cca2": "RU",
+        "capital": "Moscow",
+        "lat": 60,
+        "lng": 100,
+        "cca3": "RUS",
+    },
+    {
+        "name": "Greenland",
+        "area": 2166086,
+        "cioc": "",
+        "cca2": "GL",
+        "capital": "Nuuk",
+        "lat": 72,
+        "lng": -40,
+        "cca3": "GRL",
+    },
+    {
+        "name": "Andorra",
+        "area": 468,
+        "cioc": "AND",
+        "cca2": "AD",
+        "capital": "Andorra la Vella",
+        "lat": 42.5,
+        "lng": 1.5,
+        "cca3": "AND",
+    },
+    {
+        "name": "Mozambique",
+        "area": 801590,
+        "cioc": "MOZ",
+        "cca2": "MZ",
+        "capital": "Maputo",
+        "lat": -18.25,
+        "lng": 35,
+        "cca3": "MOZ",
+    },
+    {
+        "name": "Tajikistan",
+        "area": 143100,
+        "cioc": "TJK",
+        "cca2": "TJ",
+        "capital": "Dushanbe",
+        "lat": 39,
+        "lng": 71,
+        "cca3": "TJK",
+    },
+    {
+        "name": "Haiti",
+        "area": 27750,
+        "cioc": "HAI",
+        "cca2": "HT",
+        "capital": "Port-au-Prince",
+        "lat": 19,
+        "lng": -72.41666666,
+        "cca3": "HTI",
+    },
+    {
+        "name": "Mexico",
+        "area": 1964375,
+        "cioc": "MEX",
+        "cca2": "MX",
+        "capital": "Mexico City",
+        "lat": 23,
+        "lng": -102,
+        "cca3": "MEX",
+    },
+    {
+        "name": "Zimbabwe",
+        "area": 390757,
+        "cioc": "ZIM",
+        "cca2": "ZW",
+        "capital": "Harare",
+        "lat": -20,
+        "lng": 30,
+        "cca3": "ZWE",
+    },
+    {
+        "name": "Saint Lucia",
+        "area": 616,
+        "cioc": "LCA",
+        "cca2": "LC",
+        "capital": "Castries",
+        "lat": 13.88333333,
+        "lng": -60.96666666,
+        "cca3": "LCA",
+    },
+    {
+        "name": "India",
+        "area": 3287590,
+        "cioc": "IND",
+        "cca2": "IN",
+        "capital": "New Delhi",
+        "lat": 20,
+        "lng": 77,
+        "cca3": "IND",
+    },
+    {
+        "name": "Latvia",
+        "area": 64559,
+        "cioc": "LAT",
+        "cca2": "LV",
+        "capital": "Riga",
+        "lat": 57,
+        "lng": 25,
+        "cca3": "LVA",
+    },
+    {
+        "name": "Bhutan",
+        "area": 38394,
+        "cioc": "BHU",
+        "cca2": "BT",
+        "capital": "Thimphu",
+        "lat": 27.5,
+        "lng": 90.5,
+        "cca3": "BTN",
+    },
+    {
+        "name": "Saint Vincent and the Grenadines",
+        "area": 389,
+        "cioc": "VIN",
+        "cca2": "VC",
+        "capital": "Kingstown",
+        "lat": 13.25,
+        "lng": -61.2,
+        "cca3": "VCT",
+    },
+    {
+        "name": "Vietnam",
+        "area": 331212,
+        "cioc": "VIE",
+        "cca2": "VN",
+        "capital": "Hanoi",
+        "lat": 16.16666666,
+        "lng": 107.83333333,
+        "cca3": "VNM",
+    },
+    {
+        "name": "Norway",
+        "area": 323802,
+        "cioc": "NOR",
+        "cca2": "NO",
+        "capital": "Oslo",
+        "lat": 62,
+        "lng": 10,
+        "cca3": "NOR",
+    },
+    {
+        "name": "Czech Republic",
+        "area": 78865,
+        "cioc": "CZE",
+        "cca2": "CZ",
+        "capital": "Prague",
+        "lat": 49.75,
+        "lng": 15.5,
+        "cca3": "CZE",
+    },
+    {
+        "name": "French Southern and Antarctic Lands",
+        "area": 7747,
+        "cioc": "",
+        "cca2": "TF",
+        "capital": "Port-aux-Francais",
+        "lat": -49.25,
+        "lng": 69.167,
+        "cca3": "ATF",
+    },
+    {
+        "name": "Antigua and Barbuda",
+        "area": 442,
+        "cioc": "ANT",
+        "cca2": "AG",
+        "capital": "Saint John's",
+        "lat": 17.05,
+        "lng": -61.8,
+        "cca3": "ATG",
+    },
+    {
+        "name": "Fiji",
+        "area": 18272,
+        "cioc": "FIJ",
+        "cca2": "FJ",
+        "capital": "Suva",
+        "lat": -18,
+        "lng": 175,
+        "cca3": "FJI",
+    },
+    {
+        "name": "British Indian Ocean Territory",
+        "area": 60,
+        "cioc": "",
+        "cca2": "IO",
+        "capital": "Diego Garcia",
+        "lat": -6,
+        "lng": 71.5,
+        "cca3": "IOT",
+    },
+    {
+        "name": "Honduras",
+        "area": 112492,
+        "cioc": "HON",
+        "cca2": "HN",
+        "capital": "Tegucigalpa",
+        "lat": 15,
+        "lng": -86.5,
+        "cca3": "HND",
+    },
+    {
+        "name": "Mauritius",
+        "area": 2040,
+        "cioc": "MRI",
+        "cca2": "MU",
+        "capital": "Port Louis",
+        "lat": -20.28333333,
+        "lng": 57.55,
+        "cca3": "MUS",
+    },
+    {
+        "name": "Antarctica",
+        "area": 14000000,
+        "cioc": "",
+        "cca2": "AQ",
+        "capital": "",
+        "lat": -90,
+        "lng": 0,
+        "cca3": "ATA",
+    },
+    {
+        "name": "Luxembourg",
+        "area": 2586,
+        "cioc": "LUX",
+        "cca2": "LU",
+        "capital": "Luxembourg",
+        "lat": 49.75,
+        "lng": 6.16666666,
+        "cca3": "LUX",
+    },
+    {
+        "name": "Israel",
+        "area": 20770,
+        "cioc": "ISR",
+        "cca2": "IL",
+        "capital": "Jerusalem",
+        "lat": 31.47,
+        "lng": 35.13,
+        "cca3": "ISR",
+    },
+    {
+        "name": "Micronesia",
+        "area": 702,
+        "cioc": "FSM",
+        "cca2": "FM",
+        "capital": "Palikir",
+        "lat": 6.91666666,
+        "lng": 158.25,
+        "cca3": "FSM",
+    },
+    {
+        "name": "Peru",
+        "area": 1285216,
+        "cioc": "PER",
+        "cca2": "PE",
+        "capital": "Lima",
+        "lat": -10,
+        "lng": -76,
+        "cca3": "PER",
+    },
+    {
+        "name": "Reunion",
+        "area": 2511,
+        "cioc": "",
+        "cca2": "RE",
+        "capital": "Saint-Denis",
+        "lat": -21.15,
+        "lng": 55.5,
+        "cca3": "REU",
+    },
+    {
+        "name": "Indonesia",
+        "area": 1904569,
+        "cioc": "INA",
+        "cca2": "ID",
+        "capital": "Jakarta",
+        "lat": -5,
+        "lng": 120,
+        "cca3": "IDN",
+    },
+    {
+        "name": "Vanuatu",
+        "area": 12189,
+        "cioc": "VAN",
+        "cca2": "VU",
+        "capital": "Port Vila",
+        "lat": -16,
+        "lng": 167,
+        "cca3": "VUT",
+    },
+    {
+        "name": "Macedonia",
+        "area": 25713,
+        "cioc": "MKD",
+        "cca2": "MK",
+        "capital": "Skopje",
+        "lat": 41.83333333,
+        "lng": 22,
+        "cca3": "MKD",
+    },
+    {
+        "name": "DR Congo",
+        "area": 2344858,
+        "cioc": "COD",
+        "cca2": "CD",
+        "capital": "Kinshasa",
+        "lat": 0,
+        "lng": 25,
+        "cca3": "COD",
+    },
+    {
+        "name": "Republic of the Congo",
+        "area": 342000,
+        "cioc": "CGO",
+        "cca2": "CG",
+        "capital": "Brazzaville",
+        "lat": -1,
+        "lng": 15,
+        "cca3": "COG",
+    },
+    {
+        "name": "Iceland",
+        "area": 103000,
+        "cioc": "ISL",
+        "cca2": "IS",
+        "capital": "Reykjavik",
+        "lat": 65,
+        "lng": -18,
+        "cca3": "ISL",
+    },
+    {
+        "name": "Guadeloupe",
+        "area": 1628,
+        "cioc": "",
+        "cca2": "GP",
+        "capital": "Basse-Terre",
+        "lat": 16.25,
+        "lng": -61.583333,
+        "cca3": "GLP",
+    },
+    {
+        "name": "Cook Islands",
+        "area": 236,
+        "cioc": "COK",
+        "cca2": "CK",
+        "capital": "Avarua",
+        "lat": -21.23333333,
+        "lng": -159.76666666,
+        "cca3": "COK",
+    },
+    {
+        "name": "Comoros",
+        "area": 1862,
+        "cioc": "COM",
+        "cca2": "KM",
+        "capital": "Moroni",
+        "lat": -12.16666666,
+        "lng": 44.25,
+        "cca3": "COM",
+    },
+    {
+        "name": "Colombia",
+        "area": 1141748,
+        "cioc": "COL",
+        "cca2": "CO",
+        "capital": "Bogota",
+        "lat": 4,
+        "lng": -72,
+        "cca3": "COL",
+    },
+    {
+        "name": "Nigeria",
+        "area": 923768,
+        "cioc": "NGR",
+        "cca2": "NG",
+        "capital": "Abuja",
+        "lat": 10,
+        "lng": 8,
+        "cca3": "NGA",
+    },
+    {
+        "name": "Timor-Leste",
+        "area": 14874,
+        "cioc": "TLS",
+        "cca2": "TL",
+        "capital": "Dili",
+        "lat": -8.83333333,
+        "lng": 125.91666666,
+        "cca3": "TLS",
+    },
+    {
+        "name": "Taiwan",
+        "area": 36193,
+        "cioc": "TPE",
+        "cca2": "TW",
+        "capital": "Taipei",
+        "lat": 23.5,
+        "lng": 121,
+        "cca3": "TWN",
+    },
+    {
+        "name": "Portugal",
+        "area": 92090,
+        "cioc": "POR",
+        "cca2": "PT",
+        "capital": "Lisbon",
+        "lat": 39.5,
+        "lng": -8,
+        "cca3": "PRT",
+    },
+    {
+        "name": "Moldova",
+        "area": 33846,
+        "cioc": "MDA",
+        "cca2": "MD",
+        "capital": "Chisinau",
+        "lat": 47,
+        "lng": 29,
+        "cca3": "MDA",
+    },
+    {
+        "name": "Guernsey",
+        "area": 78,
+        "cioc": "",
+        "cca2": "GG",
+        "capital": "St. Peter Port",
+        "lat": 49.46666666,
+        "lng": -2.58333333,
+        "cca3": "GGY",
+    },
+    {
+        "name": "Madagascar",
+        "area": 587041,
+        "cioc": "MAD",
+        "cca2": "MG",
+        "capital": "Antananarivo",
+        "lat": -20,
+        "lng": 47,
+        "cca3": "MDG",
+    },
+    {
+        "name": "Ecuador",
+        "area": 276841,
+        "cioc": "ECU",
+        "cca2": "EC",
+        "capital": "Quito",
+        "lat": -2,
+        "lng": -77.5,
+        "cca3": "ECU",
+    },
+    {
+        "name": "Senegal",
+        "area": 196722,
+        "cioc": "SEN",
+        "cca2": "SN",
+        "capital": "Dakar",
+        "lat": 14,
+        "lng": -14,
+        "cca3": "SEN",
+    },
+    {
+        "name": "New Zealand",
+        "area": 270467,
+        "cioc": "NZL",
+        "cca2": "NZ",
+        "capital": "Wellington",
+        "lat": -41,
+        "lng": 174,
+        "cca3": "NZL",
+    },
+    {
+        "name": "Maldives",
+        "area": 300,
+        "cioc": "MDV",
+        "cca2": "MV",
+        "capital": "Male",
+        "lat": 3.25,
+        "lng": 73,
+        "cca3": "MDV",
+    },
+    {
+        "name": "American Samoa",
+        "area": 199,
+        "cioc": "ASA",
+        "cca2": "AS",
+        "capital": "Pago Pago",
+        "lat": -14.33333333,
+        "lng": -170,
+        "cca3": "ASM",
+    },
+    {
+        "name": "Saint Pierre and Miquelon",
+        "area": 242,
+        "cioc": "",
+        "cca2": "PM",
+        "capital": "Saint-Pierre",
+        "lat": 46.83333333,
+        "lng": -56.33333333,
+        "cca3": "SPM",
+    },
+    {
+        "name": "Curacao",
+        "area": 444,
+        "cioc": "",
+        "cca2": "CW",
+        "capital": "Willemstad",
+        "lat": 12.116667,
+        "lng": -68.933333,
+        "cca3": "CUW",
+    },
+    {
+        "name": "France",
+        "area": 551695,
+        "cioc": "FRA",
+        "cca2": "FR",
+        "capital": "Paris",
+        "lat": 46,
+        "lng": 2,
+        "cca3": "FRA",
+    },
+    {
+        "name": "Lithuania",
+        "area": 65300,
+        "cioc": "LTU",
+        "cca2": "LT",
+        "capital": "Vilnius",
+        "lat": 56,
+        "lng": 24,
+        "cca3": "LTU",
+    },
+    {
+        "name": "Rwanda",
+        "area": 26338,
+        "cioc": "RWA",
+        "cca2": "RW",
+        "capital": "Kigali",
+        "lat": -2,
+        "lng": 30,
+        "cca3": "RWA",
+    },
+    {
+        "name": "Zambia",
+        "area": 752612,
+        "cioc": "ZAM",
+        "cca2": "ZM",
+        "capital": "Lusaka",
+        "lat": -15,
+        "lng": 30,
+        "cca3": "ZMB",
+    },
+    {
+        "name": "Gambia",
+        "area": 10689,
+        "cioc": "GAM",
+        "cca2": "GM",
+        "capital": "Banjul",
+        "lat": 13.46666666,
+        "lng": -16.56666666,
+        "cca3": "GMB",
+    },
+    {
+        "name": "Wallis and Futuna",
+        "area": 142,
+        "cioc": "",
+        "cca2": "WF",
+        "capital": "Mata-Utu",
+        "lat": -13.3,
+        "lng": -176.2,
+        "cca3": "WLF",
+    },
+    {
+        "name": "Jersey",
+        "area": 116,
+        "cioc": "",
+        "cca2": "JE",
+        "capital": "Saint Helier",
+        "lat": 49.25,
+        "lng": -2.16666666,
+        "cca3": "JEY",
+    },
+    {
+        "name": "Faroe Islands",
+        "area": 1393,
+        "cioc": "",
+        "cca2": "FO",
+        "capital": "Torshavn",
+        "lat": 62,
+        "lng": -7,
+        "cca3": "FRO",
+    },
+    {
+        "name": "Guatemala",
+        "area": 108889,
+        "cioc": "GUA",
+        "cca2": "GT",
+        "capital": "Guatemala City",
+        "lat": 15.5,
+        "lng": -90.25,
+        "cca3": "GTM",
+    },
+    {
+        "name": "Denmark",
+        "area": 43094,
+        "cioc": "DEN",
+        "cca2": "DK",
+        "capital": "Copenhagen",
+        "lat": 56,
+        "lng": 10,
+        "cca3": "DNK",
+    },
+    {
+        "name": "Isle of Man",
+        "area": 572,
+        "cioc": "",
+        "cca2": "IM",
+        "capital": "Douglas",
+        "lat": 54.25,
+        "lng": -4.5,
+        "cca3": "IMN",
+    },
+    {
+        "name": "Australia",
+        "area": 7692024,
+        "cioc": "AUS",
+        "cca2": "AU",
+        "capital": "Canberra",
+        "lat": -27,
+        "lng": 133,
+        "cca3": "AUS",
+    },
+    {
+        "name": "Austria",
+        "area": 83871,
+        "cioc": "AUT",
+        "cca2": "AT",
+        "capital": "Vienna",
+        "lat": 47.33333333,
+        "lng": 13.33333333,
+        "cca3": "AUT",
+    },
+    {
+        "name": "Svalbard and Jan Mayen",
+        "area": -1,
+        "cioc": "",
+        "cca2": "SJ",
+        "capital": "Longyearbyen",
+        "lat": 78,
+        "lng": 20,
+        "cca3": "SJM",
+    },
+    {
+        "name": "Venezuela",
+        "area": 916445,
+        "cioc": "VEN",
+        "cca2": "VE",
+        "capital": "Caracas",
+        "lat": 8,
+        "lng": -66,
+        "cca3": "VEN",
+    },
+    {
+        "name": "Kosovo",
+        "area": 10908,
+        "cioc": "KOS",
+        "cca2": "XK",
+        "capital": "Pristina",
+        "lat": 42.666667,
+        "lng": 21.166667,
+        "cca3": "UNK",
+    },
+    {
+        "name": "Palau",
+        "area": 459,
+        "cioc": "PLW",
+        "cca2": "PW",
+        "capital": "Ngerulmud",
+        "lat": 7.5,
+        "lng": 134.5,
+        "cca3": "PLW",
+    },
+    {
+        "name": "Kenya",
+        "area": 580367,
+        "cioc": "KEN",
+        "cca2": "KE",
+        "capital": "Nairobi",
+        "lat": 1,
+        "lng": 38,
+        "cca3": "KEN",
+    },
+    {
+        "name": "Samoa",
+        "area": 2842,
+        "cioc": "SAM",
+        "cca2": "WS",
+        "capital": "Apia",
+        "lat": -13.58333333,
+        "lng": -172.33333333,
+        "cca3": "WSM",
+    },
+    {
+        "name": "Turkey",
+        "area": 783562,
+        "cioc": "TUR",
+        "cca2": "TR",
+        "capital": "Ankara",
+        "lat": 39,
+        "lng": 35,
+        "cca3": "TUR",
+    },
+    {
+        "name": "Albania",
+        "area": 28748,
+        "cioc": "ALB",
+        "cca2": "AL",
+        "capital": "Tirana",
+        "lat": 41,
+        "lng": 20,
+        "cca3": "ALB",
+    },
+    {
+        "name": "Oman",
+        "area": 309500,
+        "cioc": "OMA",
+        "cca2": "OM",
+        "capital": "Muscat",
+        "lat": 21,
+        "lng": 57,
+        "cca3": "OMN",
+    },
+    {
+        "name": "Tuvalu",
+        "area": 26,
+        "cioc": "TUV",
+        "cca2": "TV",
+        "capital": "Funafuti",
+        "lat": -8,
+        "lng": 178,
+        "cca3": "TUV",
+    },
+    {
+        "name": "Aland Islands",
+        "area": 1580,
+        "cioc": "",
+        "cca2": "AX",
+        "capital": "Mariehamn",
+        "lat": 60.116667,
+        "lng": 19.9,
+        "cca3": "ALA",
+    },
+    {
+        "name": "Brunei",
+        "area": 5765,
+        "cioc": "BRU",
+        "cca2": "BN",
+        "capital": "Bandar Seri Begawan",
+        "lat": 4.5,
+        "lng": 114.66666666,
+        "cca3": "BRN",
+    },
+    {
+        "name": "Tunisia",
+        "area": 163610,
+        "cioc": "TUN",
+        "cca2": "TN",
+        "capital": "Tunis",
+        "lat": 34,
+        "lng": 9,
+        "cca3": "TUN",
+    },
+    {
+        "name": "Pitcairn Islands",
+        "area": 47,
+        "cioc": "",
+        "cca2": "PN",
+        "capital": "Adamstown",
+        "lat": -25.06666666,
+        "lng": -130.1,
+        "cca3": "PCN",
+    },
+    {
+        "name": "Barbados",
+        "area": 430,
+        "cioc": "BAR",
+        "cca2": "BB",
+        "capital": "Bridgetown",
+        "lat": 13.16666666,
+        "lng": -59.53333333,
+        "cca3": "BRB",
+    },
+    {
+        "name": "Brazil",
+        "area": 8515767,
+        "cioc": "BRA",
+        "cca2": "BR",
+        "capital": "Brasilia",
+        "lat": -10,
+        "lng": -55,
+        "cca3": "BRA",
+    },
+    {
+        "name": "Ivory Coast",
+        "area": 322463,
+        "cioc": "CIV",
+        "cca2": "CI",
+        "capital": "Yamoussoukro",
+        "lat": 8,
+        "lng": -5,
+        "cca3": "CIV",
+    },
+    {
+        "name": "Serbia",
+        "area": 88361,
+        "cioc": "SRB",
+        "cca2": "RS",
+        "capital": "Belgrade",
+        "lat": 44,
+        "lng": 21,
+        "cca3": "SRB",
+    },
+    {
+        "name": "Equatorial Guinea",
+        "area": 28051,
+        "cioc": "GEQ",
+        "cca2": "GQ",
+        "capital": "Malabo",
+        "lat": 2,
+        "lng": 10,
+        "cca3": "GNQ",
+    },
+    {
+        "name": "United States",
+        "area": 9372610,
+        "cioc": "USA",
+        "cca2": "US",
+        "capital": "Washington D.C.",
+        "lat": 38,
+        "lng": -97,
+        "cca3": "USA",
+    },
+    {
+        "name": "Qatar",
+        "area": 11586,
+        "cioc": "QAT",
+        "cca2": "QA",
+        "capital": "Doha",
+        "lat": 25.5,
+        "lng": 51.25,
+        "cca3": "QAT",
+    },
+    {
+        "name": "Sweden",
+        "area": 450295,
+        "cioc": "SWE",
+        "cca2": "SE",
+        "capital": "Stockholm",
+        "lat": 62,
+        "lng": 15,
+        "cca3": "SWE",
+    },
+    {
+        "name": "Azerbaijan",
+        "area": 86600,
+        "cioc": "AZE",
+        "cca2": "AZ",
+        "capital": "Baku",
+        "lat": 40.5,
+        "lng": 47.5,
+        "cca3": "AZE",
+    },
+    {
+        "name": "Guinea-Bissau",
+        "area": 36125,
+        "cioc": "GBS",
+        "cca2": "GW",
+        "capital": "Bissau",
+        "lat": 12,
+        "lng": -15,
+        "cca3": "GNB",
+    },
+    {
+        "name": "Swaziland",
+        "area": 17364,
+        "cioc": "SWZ",
+        "cca2": "SZ",
+        "capital": "Lobamba",
+        "lat": -26.5,
+        "lng": 31.5,
+        "cca3": "SWZ",
+    },
+    {
+        "name": "Tonga",
+        "area": 747,
+        "cioc": "TGA",
+        "cca2": "TO",
+        "capital": "Nuku'alofa",
+        "lat": -20,
+        "lng": -175,
+        "cca3": "TON",
+    },
+    {
+        "name": "Canada",
+        "area": 9984670,
+        "cioc": "CAN",
+        "cca2": "CA",
+        "capital": "Ottawa",
+        "lat": 60,
+        "lng": -95,
+        "cca3": "CAN",
+    },
+    {
+        "name": "Ukraine",
+        "area": 603500,
+        "cioc": "UKR",
+        "cca2": "UA",
+        "capital": "Kiev",
+        "lat": 49,
+        "lng": 32,
+        "cca3": "UKR",
+    },
+    {
+        "name": "South Korea",
+        "area": 100210,
+        "cioc": "KOR",
+        "cca2": "KR",
+        "capital": "Seoul",
+        "lat": 37,
+        "lng": 127.5,
+        "cca3": "KOR",
+    },
+    {
+        "name": "Anguilla",
+        "area": 91,
+        "cioc": "",
+        "cca2": "AI",
+        "capital": "The Valley",
+        "lat": 18.25,
+        "lng": -63.16666666,
+        "cca3": "AIA",
+    },
+    {
+        "name": "Central African Republic",
+        "area": 622984,
+        "cioc": "CAF",
+        "cca2": "CF",
+        "capital": "Bangui",
+        "lat": 7,
+        "lng": 21,
+        "cca3": "CAF",
+    },
+    {
+        "name": "Slovakia",
+        "area": 49037,
+        "cioc": "SVK",
+        "cca2": "SK",
+        "capital": "Bratislava",
+        "lat": 48.66666666,
+        "lng": 19.5,
+        "cca3": "SVK",
+    },
+    {
+        "name": "Cyprus",
+        "area": 9251,
+        "cioc": "CYP",
+        "cca2": "CY",
+        "capital": "Nicosia",
+        "lat": 35,
+        "lng": 33,
+        "cca3": "CYP",
+    },
+    {
+        "name": "Bosnia and Herzegovina",
+        "area": 51209,
+        "cioc": "BIH",
+        "cca2": "BA",
+        "capital": "Sarajevo",
+        "lat": 44,
+        "lng": 18,
+        "cca3": "BIH",
+    },
+    {
+        "name": "Singapore",
+        "area": 710,
+        "cioc": "SIN",
+        "cca2": "SG",
+        "capital": "Singapore",
+        "lat": 1.36666666,
+        "lng": 103.8,
+        "cca3": "SGP",
+    },
+    {
+        "name": "South Georgia",
+        "area": 3903,
+        "cioc": "",
+        "cca2": "GS",
+        "capital": "King Edward Point",
+        "lat": -54.5,
+        "lng": -37,
+        "cca3": "SGS",
+    },
+    {
+        "name": "Somalia",
+        "area": 637657,
+        "cioc": "SOM",
+        "cca2": "SO",
+        "capital": "Mogadishu",
+        "lat": 10,
+        "lng": 49,
+        "cca3": "SOM",
+    },
+    {
+        "name": "Uzbekistan",
+        "area": 447400,
+        "cioc": "UZB",
+        "cca2": "UZ",
+        "capital": "Tashkent",
+        "lat": 41,
+        "lng": 64,
+        "cca3": "UZB",
+    },
+    {
+        "name": "Eritrea",
+        "area": 117600,
+        "cioc": "ERI",
+        "cca2": "ER",
+        "capital": "Asmara",
+        "lat": 15,
+        "lng": 39,
+        "cca3": "ERI",
+    },
+    {
+        "name": "Poland",
+        "area": 312679,
+        "cioc": "POL",
+        "cca2": "PL",
+        "capital": "Warsaw",
+        "lat": 52,
+        "lng": 20,
+        "cca3": "POL",
+    },
+    {
+        "name": "Kuwait",
+        "area": 17818,
+        "cioc": "KUW",
+        "cca2": "KW",
+        "capital": "Kuwait City",
+        "lat": 29.5,
+        "lng": 45.75,
+        "cca3": "KWT",
+    },
+    {
+        "name": "Gabon",
+        "area": 267668,
+        "cioc": "GAB",
+        "cca2": "GA",
+        "capital": "Libreville",
+        "lat": -1,
+        "lng": 11.75,
+        "cca3": "GAB",
+    },
+    {
+        "name": "Cayman Islands",
+        "area": 264,
+        "cioc": "CAY",
+        "cca2": "KY",
+        "capital": "George Town",
+        "lat": 19.5,
+        "lng": -80.5,
+        "cca3": "CYM",
+    },
+    {
+        "name": "Vatican City",
+        "area": 0.44,
+        "cioc": "",
+        "cca2": "VA",
+        "capital": "Vatican City",
+        "lat": 41.9,
+        "lng": 12.45,
+        "cca3": "VAT",
+    },
+    {
+        "name": "Estonia",
+        "area": 45227,
+        "cioc": "EST",
+        "cca2": "EE",
+        "capital": "Tallinn",
+        "lat": 59,
+        "lng": 26,
+        "cca3": "EST",
+    },
+    {
+        "name": "Malawi",
+        "area": 118484,
+        "cioc": "MAW",
+        "cca2": "MW",
+        "capital": "Lilongwe",
+        "lat": -13.5,
+        "lng": 34,
+        "cca3": "MWI",
+    },
+    {
+        "name": "Spain",
+        "area": 505992,
+        "cioc": "ESP",
+        "cca2": "ES",
+        "capital": "Madrid",
+        "lat": 40,
+        "lng": -4,
+        "cca3": "ESP",
+    },
+    {
+        "name": "Iraq",
+        "area": 438317,
+        "cioc": "IRQ",
+        "cca2": "IQ",
+        "capital": "Baghdad",
+        "lat": 33,
+        "lng": 44,
+        "cca3": "IRQ",
+    },
+    {
+        "name": "El Salvador",
+        "area": 21041,
+        "cioc": "ESA",
+        "cca2": "SV",
+        "capital": "San Salvador",
+        "lat": 13.83333333,
+        "lng": -88.91666666,
+        "cca3": "SLV",
+    },
+    {
+        "name": "Mali",
+        "area": 1240192,
+        "cioc": "MLI",
+        "cca2": "ML",
+        "capital": "Bamako",
+        "lat": 17,
+        "lng": -4,
+        "cca3": "MLI",
+    },
+    {
+        "name": "Ireland",
+        "area": 70273,
+        "cioc": "IRL",
+        "cca2": "IE",
+        "capital": "Dublin",
+        "lat": 53,
+        "lng": -8,
+        "cca3": "IRL",
+    },
+    {
+        "name": "Iran",
+        "area": 1648195,
+        "cioc": "IRI",
+        "cca2": "IR",
+        "capital": "Tehran",
+        "lat": 32,
+        "lng": 53,
+        "cca3": "IRN",
+    },
+    {
+        "name": "Aruba",
+        "area": 180,
+        "cioc": "ARU",
+        "cca2": "AW",
+        "capital": "Oranjestad",
+        "lat": 12.5,
+        "lng": -69.96666666,
+        "cca3": "ABW",
+    },
+    {
+        "name": "Papua New Guinea",
+        "area": 462840,
+        "cioc": "PNG",
+        "cca2": "PG",
+        "capital": "Port Moresby",
+        "lat": -6,
+        "lng": 147,
+        "cca3": "PNG",
+    },
+    {
+        "name": "Panama",
+        "area": 75417,
+        "cioc": "PAN",
+        "cca2": "PA",
+        "capital": "Panama City",
+        "lat": 9,
+        "lng": -80,
+        "cca3": "PAN",
+    },
+    {
+        "name": "Sudan",
+        "area": 1886068,
+        "cioc": "SUD",
+        "cca2": "SD",
+        "capital": "Khartoum",
+        "lat": 15,
+        "lng": 30,
+        "cca3": "SDN",
+    },
+    {
+        "name": "Solomon Islands",
+        "area": 28896,
+        "cioc": "SOL",
+        "cca2": "SB",
+        "capital": "Honiara",
+        "lat": -8,
+        "lng": 159,
+        "cca3": "SLB",
+    },
+    {
+        "name": "Western Sahara",
+        "area": 266000,
+        "cioc": "",
+        "cca2": "EH",
+        "capital": "El Aaiun",
+        "lat": 24.5,
+        "lng": -13,
+        "cca3": "ESH",
+    },
+    {
+        "name": "Monaco",
+        "area": 2.02,
+        "cioc": "MON",
+        "cca2": "MC",
+        "capital": "Monaco",
+        "lat": 43.73333333,
+        "lng": 7.4,
+        "cca3": "MCO",
+    },
+    {
+        "name": "Italy",
+        "area": 301336,
+        "cioc": "ITA",
+        "cca2": "IT",
+        "capital": "Rome",
+        "lat": 42.83333333,
+        "lng": 12.83333333,
+        "cca3": "ITA",
+    },
+    {
+        "name": "Japan",
+        "area": 377930,
+        "cioc": "JPN",
+        "cca2": "JP",
+        "capital": "Tokyo",
+        "lat": 36,
+        "lng": 138,
+        "cca3": "JPN",
+    },
+    {
+        "name": "Kyrgyzstan",
+        "area": 199951,
+        "cioc": "KGZ",
+        "cca2": "KG",
+        "capital": "Bishkek",
+        "lat": 41,
+        "lng": 75,
+        "cca3": "KGZ",
+    },
+    {
+        "name": "Uganda",
+        "area": 241550,
+        "cioc": "UGA",
+        "cca2": "UG",
+        "capital": "Kampala",
+        "lat": 1,
+        "lng": 32,
+        "cca3": "UGA",
+    },
+    {
+        "name": "New Caledonia",
+        "area": 18575,
+        "cioc": "",
+        "cca2": "NC",
+        "capital": "Noumea",
+        "lat": -21.5,
+        "lng": 165.5,
+        "cca3": "NCL",
+    },
+    {
+        "name": "United Arab Emirates",
+        "area": 83600,
+        "cioc": "UAE",
+        "cca2": "AE",
+        "capital": "Abu Dhabi",
+        "lat": 24,
+        "lng": 54,
+        "cca3": "ARE",
+    },
+    {
+        "name": "Argentina",
+        "area": 2780400,
+        "cioc": "ARG",
+        "cca2": "AR",
+        "capital": "Buenos Aires",
+        "lat": -34,
+        "lng": -64,
+        "cca3": "ARG",
+    },
+    {
+        "name": "Bahamas",
+        "area": 13943,
+        "cioc": "BAH",
+        "cca2": "BS",
+        "capital": "Nassau",
+        "lat": 24.25,
+        "lng": -76,
+        "cca3": "BHS",
+    },
+    {
+        "name": "Bahrain",
+        "area": 765,
+        "cioc": "BRN",
+        "cca2": "BH",
+        "capital": "Manama",
+        "lat": 26,
+        "lng": 50.55,
+        "cca3": "BHR",
+    },
+    {
+        "name": "Armenia",
+        "area": 29743,
+        "cioc": "ARM",
+        "cca2": "AM",
+        "capital": "Yerevan",
+        "lat": 40,
+        "lng": 45,
+        "cca3": "ARM",
+    },
+    {
+        "name": "Nauru",
+        "area": 21,
+        "cioc": "NRU",
+        "cca2": "NR",
+        "capital": "Yaren",
+        "lat": -0.53333333,
+        "lng": 166.91666666,
+        "cca3": "NRU",
+    },
+    {
+        "name": "Cuba",
+        "area": 109884,
+        "cioc": "CUB",
+        "cca2": "CU",
+        "capital": "Havana",
+        "lat": 21.5,
+        "lng": -80,
+        "cca3": "CUB",
+    },
+]
+
+all_lookups: Dict[str, Dict[str, Dict[str, Any]]] = {}
+lookups = ["cioc", "cca2", "cca3", "name"]
+for lookup in lookups:
+    all_lookups[lookup] = {}
+    for country in countries:
+        all_lookups[lookup][country[lookup].lower()] = country
+
+
+def get(field: str, symbol: str) -> Optional[Dict[str, Any]]:
+    """
+    Get country data based on a standard code and a symbol
+    """
+    return all_lookups[field].get(symbol.lower())
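Editor's note: a minimal usage sketch of the lookup built above (illustrative only, not part of the committed countries.py). get() takes one of the four index fields ("cioc", "cca2", "cca3", "name") and a case-insensitive symbol, returning the matching country record or None.

    # hypothetical usage; assumes this module is importable alongside the other examples
    from .countries import get

    assert get("cca2", "us")["name"] == "United States"      # lookups are case-insensitive
    assert get("cioc", "BRA")["capital"] == "Brasilia"
    assert get("name", "atlantis") is None                    # unknown symbols return None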

+ 114 - 0
data/purposeCombined/BI/examples/country_map.py

@@ -0,0 +1,114 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import datetime
+
+import pandas as pd
+from sqlalchemy import BigInteger, Date, String
+from sqlalchemy.sql import column
+
+from superset import db
+from superset.connectors.sqla.models import SqlMetric
+from superset.models.slice import Slice
+from superset.utils import core as utils
+
+from .helpers import (
+    get_example_data,
+    get_slice_json,
+    merge_slice,
+    misc_dash_slices,
+    TBL,
+)
+
+
+def load_country_map_data(only_metadata: bool = False, force: bool = False) -> None:
+    """Loading data for map with country map"""
+    tbl_name = "birth_france_by_region"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        csv_bytes = get_example_data(
+            "birth_france_data_for_country_map.csv", is_gzip=False, make_bytes=True
+        )
+        data = pd.read_csv(csv_bytes, encoding="utf-8")
+        data["dttm"] = datetime.datetime.now().date()
+        data.to_sql(  # pylint: disable=no-member
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={
+                "DEPT_ID": String(10),
+                "2003": BigInteger,
+                "2004": BigInteger,
+                "2005": BigInteger,
+                "2006": BigInteger,
+                "2007": BigInteger,
+                "2008": BigInteger,
+                "2009": BigInteger,
+                "2010": BigInteger,
+                "2011": BigInteger,
+                "2012": BigInteger,
+                "2013": BigInteger,
+                "2014": BigInteger,
+                "dttm": Date(),
+            },
+            index=False,
+        )
+        print("Done loading table!")
+        print("-" * 80)
+
+    print("Creating table reference")
+    obj = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not obj:
+        obj = TBL(table_name=tbl_name)
+    obj.main_dttm_col = "dttm"
+    obj.database = database
+    if not any(col.metric_name == "avg__2004" for col in obj.metrics):
+        col = str(column("2004").compile(db.engine))
+        obj.metrics.append(SqlMetric(metric_name="avg__2004", expression=f"AVG({col})"))
+    db.session.merge(obj)
+    db.session.commit()
+    obj.fetch_metadata()
+    tbl = obj
+
+    slice_data = {
+        "granularity_sqla": "",
+        "since": "",
+        "until": "",
+        "viz_type": "country_map",
+        "entity": "DEPT_ID",
+        "metric": {
+            "expressionType": "SIMPLE",
+            "column": {"type": "INT", "column_name": "2004"},
+            "aggregate": "AVG",
+            "label": "Boys",
+            "optionName": "metric_112342",
+        },
+        "row_limit": 500000,
+    }
+
+    print("Creating a slice")
+    slc = Slice(
+        slice_name="Birth in France by department in 2016",
+        viz_type="country_map",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    misc_dash_slices.add(slc.slice_name)
+    merge_slice(slc)

+ 100 - 0
data/purposeCombined/BI/examples/css_templates.py

@@ -0,0 +1,100 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import textwrap
+
+from superset import db
+from superset.models.core import CssTemplate
+
+
+def load_css_templates() -> None:
+    """Loads 2 css templates to demonstrate the feature"""
+    print("Creating default CSS templates")
+
+    obj = db.session.query(CssTemplate).filter_by(template_name="Flat").first()
+    if not obj:
+        obj = CssTemplate(template_name="Flat")
+    css = textwrap.dedent(
+        """\
+    .navbar {
+        transition: opacity 0.5s ease;
+        opacity: 0.05;
+    }
+    .navbar:hover {
+        opacity: 1;
+    }
+    .chart-header .header{
+        font-weight: @font-weight-normal;
+        font-size: 12px;
+    }
+    /*
+    var bnbColors = [
+        //rausch    hackb      kazan      babu      lima        beach     tirol
+        '#ff5a5f', '#7b0051', '#007A87', '#00d1c1', '#8ce071', '#ffb400', '#b4a76c',
+        '#ff8083', '#cc0086', '#00a1b3', '#00ffeb', '#bbedab', '#ffd266', '#cbc29a',
+        '#ff3339', '#ff1ab1', '#005c66', '#00b3a5', '#55d12e', '#b37e00', '#988b4e',
+     ];
+    */
+    """
+    )
+    obj.css = css
+    db.session.merge(obj)
+    db.session.commit()
+
+    obj = db.session.query(CssTemplate).filter_by(template_name="Courier Black").first()
+    if not obj:
+        obj = CssTemplate(template_name="Courier Black")
+    css = textwrap.dedent(
+        """\
+    h2 {
+        color: white;
+        font-size: 52px;
+    }
+    .navbar {
+        box-shadow: none;
+    }
+    .navbar {
+        transition: opacity 0.5s ease;
+        opacity: 0.05;
+    }
+    .navbar:hover {
+        opacity: 1;
+    }
+    .chart-header .header{
+        font-weight: @font-weight-normal;
+        font-size: 12px;
+    }
+    .nvd3 text {
+        font-size: 12px;
+        font-family: inherit;
+    }
+    body{
+        background: #000;
+        font-family: Courier, Monaco, monospace;
+    }
+    /*
+    var bnbColors = [
+        //rausch    hackb      kazan      babu      lima        beach     tirol
+        '#ff5a5f', '#7b0051', '#007A87', '#00d1c1', '#8ce071', '#ffb400', '#b4a76c',
+        '#ff8083', '#cc0086', '#00a1b3', '#00ffeb', '#bbedab', '#ffd266', '#cbc29a',
+        '#ff3339', '#ff1ab1', '#005c66', '#00b3a5', '#55d12e', '#b37e00', '#988b4e',
+     ];
+    */
+    """
+    )
+    obj.css = css
+    db.session.merge(obj)
+    db.session.commit()

+ 529 - 0
data/purposeCombined/BI/examples/deck.py

@@ -0,0 +1,529 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=too-many-statements
+import json
+
+from superset import db
+from superset.models.dashboard import Dashboard
+from superset.models.slice import Slice
+
+from .helpers import get_slice_json, merge_slice, TBL, update_slice_ids
+
+COLOR_RED = {"r": 205, "g": 0, "b": 3, "a": 0.82}
+POSITION_JSON = """\
+{
+    "CHART-3afd9d70": {
+        "meta": {
+            "chartId": 66,
+            "width": 6,
+            "height": 50
+        },
+        "type": "CHART",
+        "id": "CHART-3afd9d70",
+        "children": []
+    },
+    "CHART-2ee7fa5e": {
+        "meta": {
+            "chartId": 67,
+            "width": 6,
+            "height": 50
+        },
+        "type": "CHART",
+        "id": "CHART-2ee7fa5e",
+        "children": []
+    },
+    "CHART-201f7715": {
+        "meta": {
+            "chartId": 68,
+            "width": 6,
+            "height": 50
+        },
+        "type": "CHART",
+        "id": "CHART-201f7715",
+        "children": []
+    },
+    "CHART-d02f6c40": {
+        "meta": {
+            "chartId": 69,
+            "width": 6,
+            "height": 50
+        },
+        "type": "CHART",
+        "id": "CHART-d02f6c40",
+        "children": []
+    },
+    "CHART-2673431d": {
+        "meta": {
+            "chartId": 70,
+            "width": 6,
+            "height": 50
+        },
+        "type": "CHART",
+        "id": "CHART-2673431d",
+        "children": []
+    },
+    "CHART-85265a60": {
+        "meta": {
+            "chartId": 71,
+            "width": 6,
+            "height": 50
+        },
+        "type": "CHART",
+        "id": "CHART-85265a60",
+        "children": []
+    },
+    "CHART-2b87513c": {
+        "meta": {
+            "chartId": 72,
+            "width": 6,
+            "height": 50
+        },
+        "type": "CHART",
+        "id": "CHART-2b87513c",
+        "children": []
+    },
+    "GRID_ID": {
+        "type": "GRID",
+        "id": "GRID_ID",
+        "children": [
+            "ROW-a7b16cb5",
+            "ROW-72c218a5",
+            "ROW-957ba55b",
+            "ROW-af041bdd"
+        ]
+    },
+    "HEADER_ID": {
+        "meta": {
+            "text": "deck.gl Demo"
+        },
+        "type": "HEADER",
+        "id": "HEADER_ID"
+    },
+    "ROOT_ID": {
+        "type": "ROOT",
+        "id": "ROOT_ID",
+        "children": [
+            "GRID_ID"
+        ]
+    },
+    "ROW-72c218a5": {
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW",
+        "id": "ROW-72c218a5",
+        "children": [
+            "CHART-d02f6c40",
+            "CHART-201f7715"
+        ]
+    },
+    "ROW-957ba55b": {
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW",
+        "id": "ROW-957ba55b",
+        "children": [
+            "CHART-2673431d",
+            "CHART-85265a60"
+        ]
+    },
+    "ROW-a7b16cb5": {
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW",
+        "id": "ROW-a7b16cb5",
+        "children": [
+            "CHART-3afd9d70",
+            "CHART-2ee7fa5e"
+        ]
+    },
+    "ROW-af041bdd": {
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW",
+        "id": "ROW-af041bdd",
+        "children": [
+            "CHART-2b87513c"
+        ]
+    },
+    "DASHBOARD_VERSION_KEY": "v2"
+}"""
+
+
+def load_deck_dash() -> None:
+    print("Loading deck.gl dashboard")
+    slices = []
+    tbl = db.session.query(TBL).filter_by(table_name="long_lat").first()
+    slice_data = {
+        "spatial": {"type": "latlong", "lonCol": "LON", "latCol": "LAT"},
+        "color_picker": COLOR_RED,
+        "datasource": "5__table",
+        "granularity_sqla": None,
+        "groupby": [],
+        "mapbox_style": "mapbox://styles/mapbox/light-v9",
+        "multiplier": 10,
+        "point_radius_fixed": {"type": "metric", "value": "count"},
+        "point_unit": "square_m",
+        "min_radius": 1,
+        "max_radius": 250,
+        "row_limit": 5000,
+        "time_range": " : ",
+        "size": "count",
+        "time_grain_sqla": None,
+        "viewport": {
+            "bearing": -4.952916738791771,
+            "latitude": 37.78926922909199,
+            "longitude": -122.42613341901688,
+            "pitch": 4.750411100577438,
+            "zoom": 12.729132798697304,
+        },
+        "viz_type": "deck_scatter",
+    }
+
+    print("Creating Scatterplot slice")
+    slc = Slice(
+        slice_name="Scatterplot",
+        viz_type="deck_scatter",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)
+    slices.append(slc)
+
+    slice_data = {
+        "point_unit": "square_m",
+        "row_limit": 5000,
+        "spatial": {"type": "latlong", "lonCol": "LON", "latCol": "LAT"},
+        "mapbox_style": "mapbox://styles/mapbox/dark-v9",
+        "granularity_sqla": None,
+        "size": "count",
+        "viz_type": "deck_screengrid",
+        "time_range": "No filter",
+        "point_radius": "Auto",
+        "color_picker": {"a": 1, "r": 14, "b": 0, "g": 255},
+        "grid_size": 20,
+        "viewport": {
+            "zoom": 14.161641703941438,
+            "longitude": -122.41827069521386,
+            "bearing": -4.952916738791771,
+            "latitude": 37.76024135844065,
+            "pitch": 4.750411100577438,
+        },
+        "point_radius_fixed": {"type": "fix", "value": 2000},
+        "datasource": "5__table",
+        "time_grain_sqla": None,
+        "groupby": [],
+    }
+    print("Creating Screen Grid slice")
+    slc = Slice(
+        slice_name="Screen grid",
+        viz_type="deck_screengrid",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)
+    slices.append(slc)
+
+    slice_data = {
+        "spatial": {"type": "latlong", "lonCol": "LON", "latCol": "LAT"},
+        "row_limit": 5000,
+        "mapbox_style": "mapbox://styles/mapbox/streets-v9",
+        "granularity_sqla": None,
+        "size": "count",
+        "viz_type": "deck_hex",
+        "time_range": "No filter",
+        "point_radius_unit": "Pixels",
+        "point_radius": "Auto",
+        "color_picker": {"a": 1, "r": 14, "b": 0, "g": 255},
+        "grid_size": 40,
+        "extruded": True,
+        "viewport": {
+            "latitude": 37.789795085160335,
+            "pitch": 54.08961642447763,
+            "zoom": 13.835465702403654,
+            "longitude": -122.40632230075536,
+            "bearing": -2.3984797349335167,
+        },
+        "point_radius_fixed": {"type": "fix", "value": 2000},
+        "datasource": "5__table",
+        "time_grain_sqla": None,
+        "groupby": [],
+    }
+    print("Creating Hex slice")
+    slc = Slice(
+        slice_name="Hexagons",
+        viz_type="deck_hex",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)
+    slices.append(slc)
+
+    slice_data = {
+        "spatial": {"type": "latlong", "lonCol": "LON", "latCol": "LAT"},
+        "row_limit": 5000,
+        "mapbox_style": "mapbox://styles/mapbox/satellite-streets-v9",
+        "granularity_sqla": None,
+        "size": "count",
+        "viz_type": "deck_grid",
+        "point_radius_unit": "Pixels",
+        "point_radius": "Auto",
+        "time_range": "No filter",
+        "color_picker": {"a": 1, "r": 14, "b": 0, "g": 255},
+        "grid_size": 120,
+        "extruded": True,
+        "viewport": {
+            "longitude": -122.42066918995666,
+            "bearing": 155.80099696026355,
+            "zoom": 12.699690845482069,
+            "latitude": 37.7942314882596,
+            "pitch": 53.470800300695146,
+        },
+        "point_radius_fixed": {"type": "fix", "value": 2000},
+        "datasource": "5__table",
+        "time_grain_sqla": None,
+        "groupby": [],
+    }
+    print("Creating Grid slice")
+    slc = Slice(
+        slice_name="Grid",
+        viz_type="deck_grid",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)
+    slices.append(slc)
+
+    polygon_tbl = (
+        db.session.query(TBL).filter_by(table_name="sf_population_polygons").first()
+    )
+    slice_data = {
+        "datasource": "11__table",
+        "viz_type": "deck_polygon",
+        "slice_id": 41,
+        "granularity_sqla": None,
+        "time_grain_sqla": None,
+        "time_range": " : ",
+        "line_column": "contour",
+        "metric": {
+            "aggregate": "SUM",
+            "column": {
+                "column_name": "population",
+                "description": None,
+                "expression": None,
+                "filterable": True,
+                "groupby": True,
+                "id": 1332,
+                "is_dttm": False,
+                "optionName": "_col_population",
+                "python_date_format": None,
+                "type": "BIGINT",
+                "verbose_name": None,
+            },
+            "expressionType": "SIMPLE",
+            "hasCustomLabel": True,
+            "label": "Population",
+            "optionName": "metric_t2v4qbfiz1_w6qgpx4h2p",
+            "sqlExpression": None,
+        },
+        "line_type": "json",
+        "linear_color_scheme": "oranges",
+        "mapbox_style": "mapbox://styles/mapbox/light-v9",
+        "viewport": {
+            "longitude": -122.43388541747726,
+            "latitude": 37.752020331384834,
+            "zoom": 11.133995608594631,
+            "bearing": 37.89506450385642,
+            "pitch": 60,
+            "width": 667,
+            "height": 906,
+            "altitude": 1.5,
+            "maxZoom": 20,
+            "minZoom": 0,
+            "maxPitch": 60,
+            "minPitch": 0,
+            "maxLatitude": 85.05113,
+            "minLatitude": -85.05113,
+        },
+        "reverse_long_lat": False,
+        "fill_color_picker": {"r": 3, "g": 65, "b": 73, "a": 1},
+        "stroke_color_picker": {"r": 0, "g": 122, "b": 135, "a": 1},
+        "filled": True,
+        "stroked": False,
+        "extruded": True,
+        "multiplier": 0.1,
+        "point_radius_fixed": {
+            "type": "metric",
+            "value": {
+                "aggregate": None,
+                "column": None,
+                "expressionType": "SQL",
+                "hasCustomLabel": None,
+                "label": "Density",
+                "optionName": "metric_c5rvwrzoo86_293h6yrv2ic",
+                "sqlExpression": "SUM(population)/SUM(area)",
+            },
+        },
+        "js_columns": [],
+        "js_data_mutator": "",
+        "js_tooltip": "",
+        "js_onclick_href": "",
+        "legend_format": ".1s",
+        "legend_position": "tr",
+    }
+
+    print("Creating Polygon slice")
+    slc = Slice(
+        slice_name="Polygons",
+        viz_type="deck_polygon",
+        datasource_type="table",
+        datasource_id=polygon_tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)
+    slices.append(slc)
+
+    slice_data = {
+        "datasource": "10__table",
+        "viz_type": "deck_arc",
+        "slice_id": 42,
+        "granularity_sqla": None,
+        "time_grain_sqla": None,
+        "time_range": " : ",
+        "start_spatial": {
+            "type": "latlong",
+            "latCol": "LATITUDE",
+            "lonCol": "LONGITUDE",
+        },
+        "end_spatial": {
+            "type": "latlong",
+            "latCol": "LATITUDE_DEST",
+            "lonCol": "LONGITUDE_DEST",
+        },
+        "row_limit": 5000,
+        "mapbox_style": "mapbox://styles/mapbox/light-v9",
+        "viewport": {
+            "altitude": 1.5,
+            "bearing": 8.546256357301871,
+            "height": 642,
+            "latitude": 44.596651438714254,
+            "longitude": -91.84340711201104,
+            "maxLatitude": 85.05113,
+            "maxPitch": 60,
+            "maxZoom": 20,
+            "minLatitude": -85.05113,
+            "minPitch": 0,
+            "minZoom": 0,
+            "pitch": 60,
+            "width": 997,
+            "zoom": 2.929837070560775,
+        },
+        "color_picker": {"r": 0, "g": 122, "b": 135, "a": 1},
+        "stroke_width": 1,
+    }
+
+    print("Creating Arc slice")
+    slc = Slice(
+        slice_name="Arcs",
+        viz_type="deck_arc",
+        datasource_type="table",
+        datasource_id=db.session.query(TBL).filter_by(table_name="flights").first().id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)
+    slices.append(slc)
+
+    slice_data = {
+        "datasource": "12__table",
+        "slice_id": 43,
+        "viz_type": "deck_path",
+        "time_grain_sqla": None,
+        "time_range": " : ",
+        "line_column": "path_json",
+        "line_type": "json",
+        "row_limit": 5000,
+        "mapbox_style": "mapbox://styles/mapbox/light-v9",
+        "viewport": {
+            "longitude": -122.18885402582598,
+            "latitude": 37.73671752604488,
+            "zoom": 9.51847667620428,
+            "bearing": 0,
+            "pitch": 0,
+            "width": 669,
+            "height": 1094,
+            "altitude": 1.5,
+            "maxZoom": 20,
+            "minZoom": 0,
+            "maxPitch": 60,
+            "minPitch": 0,
+            "maxLatitude": 85.05113,
+            "minLatitude": -85.05113,
+        },
+        "color_picker": {"r": 0, "g": 122, "b": 135, "a": 1},
+        "line_width": 150,
+        "reverse_long_lat": False,
+        "js_columns": ["color"],
+        "js_data_mutator": "data => data.map(d => ({\n"
+        "    ...d,\n"
+        "    color: colors.hexToRGB(d.extraProps.color)\n"
+        "}));",
+        "js_tooltip": "",
+        "js_onclick_href": "",
+    }
+
+    print("Creating Path slice")
+    slc = Slice(
+        slice_name="Path",
+        viz_type="deck_path",
+        datasource_type="table",
+        datasource_id=db.session.query(TBL)
+        .filter_by(table_name="bart_lines")
+        .first()
+        .id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)
+    slices.append(slc)
+    slug = "deck"
+
+    print("Creating a dashboard")
+    title = "deck.gl Demo"
+    dash = db.session.query(Dashboard).filter_by(slug=slug).first()
+
+    if not dash:
+        dash = Dashboard()
+    dash.published = True
+    js = POSITION_JSON
+    pos = json.loads(js)
+    update_slice_ids(pos, slices)
+    dash.position_json = json.dumps(pos, indent=4)
+    dash.dashboard_title = title
+    dash.slug = slug
+    dash.slices = slices
+    db.session.merge(dash)
+    db.session.commit()
+
+
+if __name__ == "__main__":
+    load_deck_dash()

+ 141 - 0
data/purposeCombined/BI/examples/energy.py

@@ -0,0 +1,141 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Loads datasets, dashboards and slices in a new superset instance"""
+import textwrap
+
+import pandas as pd
+from sqlalchemy import Float, String
+from sqlalchemy.sql import column
+
+from superset import db
+from superset.connectors.sqla.models import SqlMetric
+from superset.models.slice import Slice
+from superset.utils import core as utils
+
+from .helpers import get_example_data, merge_slice, misc_dash_slices, TBL
+
+
+def load_energy(only_metadata: bool = False, force: bool = False) -> None:
+    """Loads an energy related dataset to use with sankey and graphs"""
+    tbl_name = "energy_usage"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data("energy.json.gz")
+        pdf = pd.read_json(data)
+        pdf.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={"source": String(255), "target": String(255), "value": Float()},
+            index=False,
+        )
+
+    print("Creating table [wb_health_population] reference")
+    tbl = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not tbl:
+        tbl = TBL(table_name=tbl_name)
+    tbl.description = "Energy consumption"
+    tbl.database = database
+
+    if not any(col.metric_name == "sum__value" for col in tbl.metrics):
+        col = str(column("value").compile(db.engine))
+        tbl.metrics.append(
+            SqlMetric(metric_name="sum__value", expression=f"SUM({col})")
+        )
+
+    db.session.merge(tbl)
+    db.session.commit()
+    tbl.fetch_metadata()
+
+    slc = Slice(
+        slice_name="Energy Sankey",
+        viz_type="sankey",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=textwrap.dedent(
+            """\
+        {
+            "collapsed_fieldsets": "",
+            "groupby": [
+                "source",
+                "target"
+            ],
+            "metric": "sum__value",
+            "row_limit": "5000",
+            "slice_name": "Energy Sankey",
+            "viz_type": "sankey"
+        }
+        """
+        ),
+    )
+    misc_dash_slices.add(slc.slice_name)
+    merge_slice(slc)
+
+    slc = Slice(
+        slice_name="Energy Force Layout",
+        viz_type="directed_force",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=textwrap.dedent(
+            """\
+        {
+            "charge": "-500",
+            "collapsed_fieldsets": "",
+            "groupby": [
+                "source",
+                "target"
+            ],
+            "link_length": "200",
+            "metric": "sum__value",
+            "row_limit": "5000",
+            "slice_name": "Force",
+            "viz_type": "directed_force"
+        }
+        """
+        ),
+    )
+    misc_dash_slices.add(slc.slice_name)
+    merge_slice(slc)
+
+    slc = Slice(
+        slice_name="Heatmap",
+        viz_type="heatmap",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=textwrap.dedent(
+            """\
+        {
+            "all_columns_x": "source",
+            "all_columns_y": "target",
+            "canvas_image_rendering": "pixelated",
+            "collapsed_fieldsets": "",
+            "linear_color_scheme": "blue_white_yellow",
+            "metric": "sum__value",
+            "normalize_across": "heatmap",
+            "slice_name": "Heatmap",
+            "viz_type": "heatmap",
+            "xscale_interval": "1",
+            "yscale_interval": "1"
+        }
+        """
+        ),
+    )
+    misc_dash_slices.add(slc.slice_name)
+    merge_slice(slc)

+ 68 - 0
data/purposeCombined/BI/examples/flights.py

@@ -0,0 +1,68 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pandas as pd
+from sqlalchemy import DateTime
+
+from superset import db
+from superset.utils import core as utils
+
+from .helpers import get_example_data, TBL
+
+
+def load_flights(only_metadata: bool = False, force: bool = False) -> None:
+    """Loading random time series data from a zip file in the repo"""
+    tbl_name = "flights"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data("flight_data.csv.gz", make_bytes=True)
+        pdf = pd.read_csv(data, encoding="latin-1")
+
+        # Loading airports info to join and get lat/long
+        airports_bytes = get_example_data("airports.csv.gz", make_bytes=True)
+        airports = pd.read_csv(airports_bytes, encoding="latin-1")
+        airports = airports.set_index("IATA_CODE")
+
+        pdf["ds"] = (
+            pdf.YEAR.map(str) + "-0" + pdf.MONTH.map(str) + "-0" + pdf.DAY.map(str)
+        )
+        pdf.ds = pd.to_datetime(pdf.ds)
+        del pdf["YEAR"]
+        del pdf["MONTH"]
+        del pdf["DAY"]
+
+        pdf = pdf.join(airports, on="ORIGIN_AIRPORT", rsuffix="_ORIG")
+        pdf = pdf.join(airports, on="DESTINATION_AIRPORT", rsuffix="_DEST")
+        pdf.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={"ds": DateTime},
+            index=False,
+        )
+
+    tbl = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not tbl:
+        tbl = TBL(table_name=tbl_name)
+    tbl.description = "Random set of flights in the US"
+    tbl.database = database
+    db.session.merge(tbl)
+    db.session.commit()
+    tbl.fetch_metadata()
+    print("Done loading table!")

+ 78 - 0
data/purposeCombined/BI/examples/helpers-backup.py

@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Loads datasets, dashboards and slices in a new superset instance"""
+import json
+import os
+import zlib
+from io import BytesIO
+from typing import Any, Dict, List, Set
+from urllib import request
+
+from superset import app, db
+from superset.connectors.connector_registry import ConnectorRegistry
+from superset.models import core as models
+from superset.models.slice import Slice
+
+BASE_URL = "https://github.com/apache-superset/examples-data/blob/master/"
+
+# Shortcuts
+DB = models.Database
+
+TBL = ConnectorRegistry.sources["table"]
+
+config = app.config
+
+EXAMPLES_FOLDER = os.path.join(config["BASE_DIR"], "examples")
+
+misc_dash_slices: Set[str] = set()  # slices assembled in a 'Misc Chart' dashboard
+
+
+def update_slice_ids(layout_dict: Dict[Any, Any], slices: List[Slice]) -> None:
+    charts = [
+        component
+        for component in layout_dict.values()
+        if isinstance(component, dict) and component["type"] == "CHART"
+    ]
+    sorted_charts = sorted(charts, key=lambda k: k["meta"]["chartId"])
+    for i, chart_component in enumerate(sorted_charts):
+        if i < len(slices):
+            chart_component["meta"]["chartId"] = int(slices[i].id)
+
+
+def merge_slice(slc: Slice) -> None:
+    o = db.session.query(Slice).filter_by(slice_name=slc.slice_name).first()
+    if o:
+        db.session.delete(o)
+    db.session.add(slc)
+    db.session.commit()
+
+
+def get_slice_json(defaults: Dict[Any, Any], **kwargs: Any) -> str:
+    defaults_copy = defaults.copy()
+    defaults_copy.update(kwargs)
+    return json.dumps(defaults_copy, indent=4, sort_keys=True)
+
+
+def get_example_data(
+    filepath: str, is_gzip: bool = True, make_bytes: bool = False
+) -> BytesIO:
+    content = request.urlopen(f"{BASE_URL}{filepath}?raw=true").read()
+    if is_gzip:
+        content = zlib.decompress(content, zlib.MAX_WBITS | 16)
+    if make_bytes:
+        content = BytesIO(content)
+    return content

+ 78 - 0
data/purposeCombined/BI/examples/helpers.py

@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Loads datasets, dashboards and slices in a new superset instance"""
+import json
+import os
+import zlib
+from io import BytesIO
+from typing import Any, Dict, List, Set
+from urllib import request
+
+from superset import app, db
+from superset.connectors.connector_registry import ConnectorRegistry
+from superset.models import core as models
+from superset.models.slice import Slice
+
+BASE_URL = "https://github.com/apache-superset/examples-data/blob/master/"
+
+# Shortcuts
+DB = models.Database
+
+TBL = ConnectorRegistry.sources["table"]
+
+config = app.config
+
+EXAMPLES_FOLDER = os.path.join(config["BASE_DIR"], "examples")
+
+misc_dash_slices: Set[str] = set()  # slices assembled in a 'Misc Chart' dashboard
+
+
+def update_slice_ids(layout_dict: Dict[Any, Any], slices: List[Slice]) -> None:
+    charts = [
+        component
+        for component in layout_dict.values()
+        if isinstance(component, dict) and component["type"] == "CHART"
+    ]
+    sorted_charts = sorted(charts, key=lambda k: k["meta"]["chartId"])
+    for i, chart_component in enumerate(sorted_charts):
+        if i < len(slices):
+            chart_component["meta"]["chartId"] = int(slices[i].id)
+
+
+def merge_slice(slc: Slice) -> None:
+    o = db.session.query(Slice).filter_by(slice_name=slc.slice_name).first()
+    if o:
+        db.session.delete(o)
+    db.session.add(slc)
+    db.session.commit()
+
+
+def get_slice_json(defaults: Dict[Any, Any], **kwargs: Any) -> str:
+    defaults_copy = defaults.copy()
+    defaults_copy.update(kwargs)
+    return json.dumps(defaults_copy, indent=4, sort_keys=True)
+
+
+def get_example_data(
+    filepath: str, is_gzip: bool = True, make_bytes: bool = False
+) -> BytesIO:
+    content = request.urlopen(f"{BASE_URL}{filepath}?raw=true").read()
+    if is_gzip:
+        content = zlib.decompress(content, zlib.MAX_WBITS | 16)
+    if make_bytes:
+        content = BytesIO(content)
+    return content
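Editor's note: a hedged sketch of how the helpers above compose in the example loaders (illustrative only; the data filename and slice params are taken from the energy example, and a configured superset app context is assumed).

    # illustrative only -- not part of the committed helpers.py
    from .helpers import get_example_data, get_slice_json, merge_slice, update_slice_ids

    raw = get_example_data("energy.json.gz")                         # gzip-decompressed bytes fetched from BASE_URL
    params = get_slice_json({"viz_type": "sankey"}, row_limit=5000)  # JSON string suitable for Slice.params
    # merge_slice(slc) replaces any existing slice with the same slice_name;
    # update_slice_ids(layout, slices) rewrites each "chartId" in a dashboard
    # position layout to point at the freshly merged Slice rows, in chartId order.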

+ 116 - 0
data/purposeCombined/BI/examples/long_lat.py

@@ -0,0 +1,116 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import datetime
+import random
+
+import geohash
+import pandas as pd
+from sqlalchemy import DateTime, Float, String
+
+from superset import db
+from superset.models.slice import Slice
+from superset.utils import core as utils
+
+from .helpers import (
+    get_example_data,
+    get_slice_json,
+    merge_slice,
+    misc_dash_slices,
+    TBL,
+)
+
+
+def load_long_lat_data(only_metadata: bool = False, force: bool = False) -> None:
+    """Loading lat/long data from a csv file in the repo"""
+    tbl_name = "long_lat"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data("san_francisco.csv.gz", make_bytes=True)
+        pdf = pd.read_csv(data, encoding="utf-8")
+        start = datetime.datetime.now().replace(
+            hour=0, minute=0, second=0, microsecond=0
+        )
+        pdf["datetime"] = [
+            start + datetime.timedelta(hours=i * 24 / (len(pdf) - 1))
+            for i in range(len(pdf))
+        ]
+        pdf["occupancy"] = [random.randint(1, 6) for _ in range(len(pdf))]
+        pdf["radius_miles"] = [random.uniform(1, 3) for _ in range(len(pdf))]
+        pdf["geohash"] = pdf[["LAT", "LON"]].apply(lambda x: geohash.encode(*x), axis=1)
+        pdf["delimited"] = pdf["LAT"].map(str).str.cat(pdf["LON"].map(str), sep=",")
+        pdf.to_sql(  # pylint: disable=no-member
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={
+                "longitude": Float(),
+                "latitude": Float(),
+                "number": Float(),
+                "street": String(100),
+                "unit": String(10),
+                "city": String(50),
+                "district": String(50),
+                "region": String(50),
+                "postcode": Float(),
+                "id": String(100),
+                "datetime": DateTime(),
+                "occupancy": Float(),
+                "radius_miles": Float(),
+                "geohash": String(12),
+                "delimited": String(60),
+            },
+            index=False,
+        )
+        print("Done loading table!")
+        print("-" * 80)
+
+    print("Creating table reference")
+    obj = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not obj:
+        obj = TBL(table_name=tbl_name)
+    obj.main_dttm_col = "datetime"
+    obj.database = database
+    db.session.merge(obj)
+    db.session.commit()
+    obj.fetch_metadata()
+    tbl = obj
+
+    slice_data = {
+        "granularity_sqla": "day",
+        "since": "2014-01-01",
+        "until": "now",
+        "viz_type": "mapbox",
+        "all_columns_x": "LON",
+        "all_columns_y": "LAT",
+        "mapbox_style": "mapbox://styles/mapbox/light-v9",
+        "all_columns": ["occupancy"],
+        "row_limit": 500000,
+    }
+
+    print("Creating a slice")
+    slc = Slice(
+        slice_name="Mapbox Long/Lat",
+        viz_type="mapbox",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    misc_dash_slices.add(slc.slice_name)
+    merge_slice(slc)

+ 224 - 0
data/purposeCombined/BI/examples/misc_dashboard-backup.py

@@ -0,0 +1,224 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import json
+import textwrap
+
+from superset import db
+from superset.models.dashboard import Dashboard
+from superset.models.slice import Slice
+
+from .helpers import misc_dash_slices, update_slice_ids
+
+DASH_SLUG = "misc_charts"
+
+
+def load_misc_dashboard() -> None:
+    """Loading a dashboard featuring misc charts"""
+
+    print("Creating the dashboard")
+    db.session.expunge_all()
+    dash = db.session.query(Dashboard).filter_by(slug=DASH_SLUG).first()
+
+    if not dash:
+        dash = Dashboard()
+    js = textwrap.dedent(
+        """\
+{
+    "CHART-BkeVbh8ANQ": {
+        "children": [],
+        "id": "CHART-BkeVbh8ANQ",
+        "meta": {
+            "chartId": 4004,
+            "height": 34,
+            "sliceName": "Multi Line",
+            "width": 8
+        },
+        "type": "CHART"
+    },
+    "CHART-H1HYNzEANX": {
+        "children": [],
+        "id": "CHART-H1HYNzEANX",
+        "meta": {
+            "chartId": 3940,
+            "height": 50,
+            "sliceName": "Energy Sankey",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "CHART-HJOYVMV0E7": {
+        "children": [],
+        "id": "CHART-HJOYVMV0E7",
+        "meta": {
+            "chartId": 3969,
+            "height": 63,
+            "sliceName": "Mapbox Long/Lat",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "CHART-S1WYNz4AVX": {
+        "children": [],
+        "id": "CHART-S1WYNz4AVX",
+        "meta": {
+            "chartId": 3989,
+            "height": 25,
+            "sliceName": "Parallel Coordinates",
+            "width": 4
+        },
+        "type": "CHART"
+    },
+    "CHART-r19KVMNCE7": {
+        "children": [],
+        "id": "CHART-r19KVMNCE7",
+        "meta": {
+            "chartId": 3971,
+            "height": 34,
+            "sliceName": "Calendar Heatmap multiformat 0",
+            "width": 4
+        },
+        "type": "CHART"
+    },
+    "CHART-rJ4K4GV04Q": {
+        "children": [],
+        "id": "CHART-rJ4K4GV04Q",
+        "meta": {
+            "chartId": 3941,
+            "height": 63,
+            "sliceName": "Energy Force Layout",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "CHART-rkgF4G4A4X": {
+        "children": [],
+        "id": "CHART-rkgF4G4A4X",
+        "meta": {
+            "chartId": 3970,
+            "height": 25,
+            "sliceName": "Birth in France by department in 2016",
+            "width": 8
+        },
+        "type": "CHART"
+    },
+    "CHART-rywK4GVR4X": {
+        "children": [],
+        "id": "CHART-rywK4GVR4X",
+        "meta": {
+            "chartId": 3942,
+            "height": 50,
+            "sliceName": "Heatmap",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "COLUMN-ByUFVf40EQ": {
+        "children": [
+            "CHART-rywK4GVR4X",
+            "CHART-HJOYVMV0E7"
+        ],
+        "id": "COLUMN-ByUFVf40EQ",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT",
+            "width": 6
+        },
+        "type": "COLUMN"
+    },
+    "COLUMN-rkmYVGN04Q": {
+        "children": [
+            "CHART-rJ4K4GV04Q",
+            "CHART-H1HYNzEANX"
+        ],
+        "id": "COLUMN-rkmYVGN04Q",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT",
+            "width": 6
+        },
+        "type": "COLUMN"
+    },
+    "GRID_ID": {
+        "children": [
+            "ROW-SytNzNA4X",
+            "ROW-S1MK4M4A4X",
+            "ROW-HkFFEzVRVm"
+        ],
+        "id": "GRID_ID",
+        "type": "GRID"
+    },
+    "HEADER_ID": {
+        "id": "HEADER_ID",
+        "meta": {
+            "text": "Misc Charts"
+        },
+        "type": "HEADER"
+    },
+    "ROOT_ID": {
+        "children": [
+            "GRID_ID"
+        ],
+        "id": "ROOT_ID",
+        "type": "ROOT"
+    },
+    "ROW-HkFFEzVRVm": {
+        "children": [
+            "CHART-r19KVMNCE7",
+            "CHART-BkeVbh8ANQ"
+        ],
+        "id": "ROW-HkFFEzVRVm",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "ROW-S1MK4M4A4X": {
+        "children": [
+            "COLUMN-rkmYVGN04Q",
+            "COLUMN-ByUFVf40EQ"
+        ],
+        "id": "ROW-S1MK4M4A4X",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "ROW-SytNzNA4X": {
+        "children": [
+            "CHART-rkgF4G4A4X",
+            "CHART-S1WYNz4AVX"
+        ],
+        "id": "ROW-SytNzNA4X",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "DASHBOARD_VERSION_KEY": "v2"
+}
+    """
+    )
+    pos = json.loads(js)
+    slices = (
+        db.session.query(Slice).filter(Slice.slice_name.in_(misc_dash_slices)).all()
+    )
+    slices = sorted(slices, key=lambda x: x.id)
+    update_slice_ids(pos, slices)
+    dash.dashboard_title = "Misc Charts"
+    dash.position_json = json.dumps(pos, indent=4)
+    dash.slug = DASH_SLUG
+    dash.slices = slices
+    db.session.merge(dash)
+    db.session.commit()

+ 224 - 0
data/purposeCombined/BI/examples/misc_dashboard.py

@@ -0,0 +1,224 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import json
+import textwrap
+
+from superset import db
+from superset.models.dashboard import Dashboard
+from superset.models.slice import Slice
+
+from .helpers import misc_dash_slices, update_slice_ids
+
+DASH_SLUG = "misc_charts"
+
+
+def load_misc_dashboard() -> None:
+    """Loading a dashboard featuring misc charts"""
+
+    print("Creating the dashboard")
+    db.session.expunge_all()
+    dash = db.session.query(Dashboard).filter_by(slug=DASH_SLUG).first()
+
+    if not dash:
+        dash = Dashboard()
+    js = textwrap.dedent(
+        """\
+{
+    "CHART-BkeVbh8ANQ": {
+        "children": [],
+        "id": "CHART-BkeVbh8ANQ",
+        "meta": {
+            "chartId": 4004,
+            "height": 34,
+            "sliceName": "Multi Line",
+            "width": 8
+        },
+        "type": "CHART"
+    },
+    "CHART-H1HYNzEANX": {
+        "children": [],
+        "id": "CHART-H1HYNzEANX",
+        "meta": {
+            "chartId": 3940,
+            "height": 50,
+            "sliceName": "Energy Sankey",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "CHART-HJOYVMV0E7": {
+        "children": [],
+        "id": "CHART-HJOYVMV0E7",
+        "meta": {
+            "chartId": 3969,
+            "height": 63,
+            "sliceName": "Mapbox Long/Lat",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "CHART-S1WYNz4AVX": {
+        "children": [],
+        "id": "CHART-S1WYNz4AVX",
+        "meta": {
+            "chartId": 3989,
+            "height": 25,
+            "sliceName": "Parallel Coordinates",
+            "width": 4
+        },
+        "type": "CHART"
+    },
+    "CHART-r19KVMNCE7": {
+        "children": [],
+        "id": "CHART-r19KVMNCE7",
+        "meta": {
+            "chartId": 3971,
+            "height": 34,
+            "sliceName": "Calendar Heatmap multiformat 0",
+            "width": 4
+        },
+        "type": "CHART"
+    },
+    "CHART-rJ4K4GV04Q": {
+        "children": [],
+        "id": "CHART-rJ4K4GV04Q",
+        "meta": {
+            "chartId": 3941,
+            "height": 63,
+            "sliceName": "Energy Force Layout",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "CHART-rkgF4G4A4X": {
+        "children": [],
+        "id": "CHART-rkgF4G4A4X",
+        "meta": {
+            "chartId": 3970,
+            "height": 25,
+            "sliceName": "Birth in France by department in 2016",
+            "width": 8
+        },
+        "type": "CHART"
+    },
+    "CHART-rywK4GVR4X": {
+        "children": [],
+        "id": "CHART-rywK4GVR4X",
+        "meta": {
+            "chartId": 3942,
+            "height": 50,
+            "sliceName": "Heatmap",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "COLUMN-ByUFVf40EQ": {
+        "children": [
+            "CHART-rywK4GVR4X",
+            "CHART-HJOYVMV0E7"
+        ],
+        "id": "COLUMN-ByUFVf40EQ",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT",
+            "width": 6
+        },
+        "type": "COLUMN"
+    },
+    "COLUMN-rkmYVGN04Q": {
+        "children": [
+            "CHART-rJ4K4GV04Q",
+            "CHART-H1HYNzEANX"
+        ],
+        "id": "COLUMN-rkmYVGN04Q",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT",
+            "width": 6
+        },
+        "type": "COLUMN"
+    },
+    "GRID_ID": {
+        "children": [
+            "ROW-SytNzNA4X",
+            "ROW-S1MK4M4A4X",
+            "ROW-HkFFEzVRVm"
+        ],
+        "id": "GRID_ID",
+        "type": "GRID"
+    },
+    "HEADER_ID": {
+        "id": "HEADER_ID",
+        "meta": {
+            "text": "Misc Charts"
+        },
+        "type": "HEADER"
+    },
+    "ROOT_ID": {
+        "children": [
+            "GRID_ID"
+        ],
+        "id": "ROOT_ID",
+        "type": "ROOT"
+    },
+    "ROW-HkFFEzVRVm": {
+        "children": [
+            "CHART-r19KVMNCE7",
+            "CHART-BkeVbh8ANQ"
+        ],
+        "id": "ROW-HkFFEzVRVm",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "ROW-S1MK4M4A4X": {
+        "children": [
+            "COLUMN-rkmYVGN04Q",
+            "COLUMN-ByUFVf40EQ"
+        ],
+        "id": "ROW-S1MK4M4A4X",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "ROW-SytNzNA4X": {
+        "children": [
+            "CHART-rkgF4G4A4X",
+            "CHART-S1WYNz4AVX"
+        ],
+        "id": "ROW-SytNzNA4X",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "DASHBOARD_VERSION_KEY": "v2"
+}
+    """
+    )
+    pos = json.loads(js)
+    slices = (
+        db.session.query(Slice).filter(Slice.slice_name.in_(misc_dash_slices)).all()
+    )
+    slices = sorted(slices, key=lambda x: x.id)
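+    # update_slice_ids() presumably rewrites the placeholder chartIds baked into the
+    # position JSON above with the ids of the slices just queried from the database.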
+    update_slice_ids(pos, slices)
+    dash.dashboard_title = "Misc Charts"
+    dash.position_json = json.dumps(pos, indent=4)
+    dash.slug = DASH_SLUG
+    dash.slices = slices
+    db.session.merge(dash)
+    db.session.commit()

+ 58 - 0
data/purposeCombined/BI/examples/multi_line.py

@@ -0,0 +1,58 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import json
+
+from superset import db
+from superset.models.slice import Slice
+
+from .birth_names import load_birth_names
+from .helpers import merge_slice, misc_dash_slices
+from .world_bank import load_world_bank_health_n_pop
+
+
+def load_multi_line(only_metadata: bool = False) -> None:
+    load_world_bank_health_n_pop(only_metadata)
+    load_birth_names(only_metadata)
+    ids = [
+        row.id
+        for row in db.session.query(Slice).filter(
+            Slice.slice_name.in_(["Growth Rate", "Trends"])
+        )
+    ]
+
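+    # The line_multi viz composes the two existing charts queried above by id, which is
+    # presumably why the datasource fields below are only placeholders (see the inline notes).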
+    slc = Slice(
+        datasource_type="table",  # not true, but needed
+        datasource_id=1,  # cannot be empty
+        slice_name="Multi Line",
+        viz_type="line_multi",
+        params=json.dumps(
+            {
+                "slice_name": "Multi Line",
+                "viz_type": "line_multi",
+                "line_charts": [ids[0]],
+                "line_charts_2": [ids[1]],
+                "since": "1970",
+                "until": "1995",
+                "prefix_metric_with_slice_name": True,
+                "show_legend": False,
+                "x_axis_format": "%Y",
+            }
+        ),
+    )
+
+    misc_dash_slices.add(slc.slice_name)
+    merge_slice(slc)

+ 117 - 0
data/purposeCombined/BI/examples/multiformat_time_series.py

@@ -0,0 +1,117 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from typing import Dict, Optional, Tuple
+
+import pandas as pd
+from sqlalchemy import BigInteger, Date, DateTime, String
+
+from superset import db
+from superset.models.slice import Slice
+from superset.utils.core import get_example_database
+
+from .helpers import (
+    config,
+    get_example_data,
+    get_slice_json,
+    merge_slice,
+    misc_dash_slices,
+    TBL,
+)
+
+
+def load_multiformat_time_series(
+    only_metadata: bool = False, force: bool = False
+) -> None:
+    """Loading time series data from a zip file in the repo"""
+    tbl_name = "multiformat_time_series"
+    database = get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data("multiformat_time_series.json.gz")
+        pdf = pd.read_json(data)
+
+        pdf.ds = pd.to_datetime(pdf.ds, unit="s")
+        pdf.ds2 = pd.to_datetime(pdf.ds2, unit="s")
+        pdf.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={
+                "ds": Date,
+                "ds2": DateTime,
+                "epoch_s": BigInteger,
+                "epoch_ms": BigInteger,
+                "string0": String(100),
+                "string1": String(100),
+                "string2": String(100),
+                "string3": String(100),
+            },
+            index=False,
+        )
+        print("Done loading table!")
+        print("-" * 80)
+
+    print(f"Creating table [{tbl_name}] reference")
+    obj = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not obj:
+        obj = TBL(table_name=tbl_name)
+    obj.main_dttm_col = "ds"
+    obj.database = database
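+    # Each entry maps a column name to (python_date_format, database expression): the
+    # epoch columns use the special "epoch_s"/"epoch_ms" formats and the string columns
+    # a strptime-style pattern; the second element is always None here.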
+    dttm_and_expr_dict: Dict[str, Tuple[Optional[str], None]] = {
+        "ds": (None, None),
+        "ds2": (None, None),
+        "epoch_s": ("epoch_s", None),
+        "epoch_ms": ("epoch_ms", None),
+        "string2": ("%Y%m%d-%H%M%S", None),
+        "string1": ("%Y-%m-%d^%H:%M:%S", None),
+        "string0": ("%Y-%m-%d %H:%M:%S.%f", None),
+        "string3": ("%Y/%m/%d%H:%M:%S.%f", None),
+    }
+    for col in obj.columns:
+        dttm_and_expr = dttm_and_expr_dict[col.column_name]
+        col.python_date_format = dttm_and_expr[0]
+        col.database_expression = dttm_and_expr[1]
+        col.is_dttm = True
+    db.session.merge(obj)
+    db.session.commit()
+    obj.fetch_metadata()
+    tbl = obj
+
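+    # One calendar heatmap per time column, so every format above gets exercised;
+    # only "Calendar Heatmap multiformat 0" is registered for the misc dashboard.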
+    print("Creating Heatmap charts")
+    for i, col in enumerate(tbl.columns):
+        slice_data = {
+            "metrics": ["count"],
+            "granularity_sqla": col.column_name,
+            "row_limit": config["ROW_LIMIT"],
+            "since": "2015",
+            "until": "2016",
+            "viz_type": "cal_heatmap",
+            "domain_granularity": "month",
+            "subdomain_granularity": "day",
+        }
+
+        slc = Slice(
+            slice_name=f"Calendar Heatmap multiformat {i}",
+            viz_type="cal_heatmap",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(slice_data),
+        )
+        merge_slice(slc)
+    misc_dash_slices.add("Calendar Heatmap multiformat 0")

+ 60 - 0
data/purposeCombined/BI/examples/paris.py

@@ -0,0 +1,60 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import json
+
+import pandas as pd
+from sqlalchemy import String, Text
+
+from superset import db
+from superset.utils import core as utils
+
+from .helpers import get_example_data, TBL
+
+
+def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) -> None:
+    tbl_name = "paris_iris_mapping"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data("paris_iris.json.gz")
+        df = pd.read_json(data)
+        df["features"] = df.features.map(json.dumps)
+
+        df.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={
+                "color": String(255),
+                "name": String(255),
+                "features": Text,
+                "type": Text,
+            },
+            index=False,
+        )
+
+    print("Creating table {} reference".format(tbl_name))
+    tbl = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not tbl:
+        tbl = TBL(table_name=tbl_name)
+    tbl.description = "Map of Paris"
+    tbl.database = database
+    db.session.merge(tbl)
+    db.session.commit()
+    tbl.fetch_metadata()

+ 81 - 0
data/purposeCombined/BI/examples/random_time_series.py

@@ -0,0 +1,81 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pandas as pd
+from sqlalchemy import DateTime
+
+from superset import db
+from superset.models.slice import Slice
+from superset.utils import core as utils
+
+from .helpers import config, get_example_data, get_slice_json, merge_slice, TBL
+
+
+def load_random_time_series_data(
+    only_metadata: bool = False, force: bool = False
+) -> None:
+    """Loading random time series data from a zip file in the repo"""
+    tbl_name = "random_time_series"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data("random_time_series.json.gz")
+        pdf = pd.read_json(data)
+        pdf.ds = pd.to_datetime(pdf.ds, unit="s")
+        pdf.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={"ds": DateTime},
+            index=False,
+        )
+        print("Done loading table!")
+        print("-" * 80)
+
+    print(f"Creating table [{tbl_name}] reference")
+    obj = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not obj:
+        obj = TBL(table_name=tbl_name)
+    obj.main_dttm_col = "ds"
+    obj.database = database
+    db.session.merge(obj)
+    db.session.commit()
+    obj.fetch_metadata()
+    tbl = obj
+
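+    # Calendar heatmap over January 2019: month-level domains subdivided into day cells.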
+    slice_data = {
+        "granularity_sqla": "day",
+        "row_limit": config["ROW_LIMIT"],
+        "since": "2019-01-01",
+        "until": "2019-02-01",
+        "metric": "count",
+        "viz_type": "cal_heatmap",
+        "domain_granularity": "month",
+        "subdomain_granularity": "day",
+    }
+
+    print("Creating a slice")
+    slc = Slice(
+        slice_name="Calendar Heatmap",
+        viz_type="cal_heatmap",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)

+ 62 - 0
data/purposeCombined/BI/examples/sf_population_polygons.py

@@ -0,0 +1,62 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import json
+
+import pandas as pd
+from sqlalchemy import BigInteger, Float, Text
+
+from superset import db
+from superset.utils import core as utils
+
+from .helpers import get_example_data, TBL
+
+
+def load_sf_population_polygons(
+    only_metadata: bool = False, force: bool = False
+) -> None:
+    tbl_name = "sf_population_polygons"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data("sf_population.json.gz")
+        df = pd.read_json(data)
+        df["contour"] = df.contour.map(json.dumps)
+
+        df.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={
+                "zipcode": BigInteger,
+                "population": BigInteger,
+                "contour": Text,
+                "area": Float,
+            },
+            index=False,
+        )
+
+    print("Creating table {} reference".format(tbl_name))
+    tbl = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not tbl:
+        tbl = TBL(table_name=tbl_name)
+    tbl.description = "Population density of San Francisco"
+    tbl.database = database
+    db.session.merge(tbl)
+    db.session.commit()
+    tbl.fetch_metadata()

+ 342 - 0
data/purposeCombined/BI/examples/tabbed_dashboard-backup.py

@@ -0,0 +1,342 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Loads datasets, dashboards and slices in a new superset instance"""
+import json
+import textwrap
+
+from superset import db
+from superset.models.dashboard import Dashboard
+from superset.models.slice import Slice
+
+from .helpers import update_slice_ids
+
+
+def load_tabbed_dashboard(_: bool = False) -> None:
+    """Creating a tabbed dashboard"""
+
+    print("Creating a dashboard with nested tabs")
+    slug = "tabbed_dash"
+    dash = db.session.query(Dashboard).filter_by(slug=slug).first()
+
+    if not dash:
+        dash = Dashboard()
+
+    # reuse charts in "World's Bank Data" and create
+    # new dashboard with nested tabs
+    tabbed_dash_slices = set()
+    tabbed_dash_slices.add("Region Filter")
+    tabbed_dash_slices.add("Growth Rate")
+    tabbed_dash_slices.add("Treemap")
+    tabbed_dash_slices.add("Box plot")
+
+    js = textwrap.dedent(
+        """\
+    {
+      "CHART-c0EjR-OZ0n": {
+        "children": [],
+        "id": "CHART-c0EjR-OZ0n",
+        "meta": {
+          "chartId": 870,
+          "height": 50,
+          "sliceName": "Box plot",
+          "width": 4
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "ROW-7G2o5uDvfo"
+        ],
+        "type": "CHART"
+      },
+      "CHART-dxV7Il74hH": {
+        "children": [],
+        "id": "CHART-dxV7Il74hH",
+        "meta": {
+          "chartId": 797,
+          "height": 50,
+          "sliceName": "Treemap",
+          "width": 4
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-gcQJxApOZS",
+          "TABS-afnrUvdxYF",
+          "TAB-jNNd4WWar1",
+          "ROW-7ygtDczaQ"
+        ],
+        "type": "CHART"
+      },
+      "CHART-jJ5Yj1Ptaz": {
+        "children": [],
+        "id": "CHART-jJ5Yj1Ptaz",
+        "meta": {
+          "chartId": 789,
+          "height": 50,
+          "sliceName": "World's Population",
+          "width": 4
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj",
+          "TAB-z81Q87PD7",
+          "ROW-G73z9PIHn"
+        ],
+        "type": "CHART"
+      },
+      "CHART-z4gmEuCqQ5": {
+        "children": [],
+        "id": "CHART-z4gmEuCqQ5",
+        "meta": {
+          "chartId": 788,
+          "height": 50,
+          "sliceName": "Region Filter",
+          "width": 4
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj",
+          "TAB-EcNm_wh922",
+          "ROW-LCjsdSetJ"
+        ],
+        "type": "CHART"
+      },
+      "DASHBOARD_VERSION_KEY": "v2",
+      "GRID_ID": {
+        "children": [],
+        "id": "GRID_ID",
+        "type": "GRID"
+      },
+      "HEADER_ID": {
+        "id": "HEADER_ID",
+        "meta": {
+          "text": "Tabbed Dashboard"
+        },
+        "type": "HEADER"
+      },
+      "ROOT_ID": {
+        "children": [
+          "TABS-lV0r00f4H1"
+        ],
+        "id": "ROOT_ID",
+        "type": "ROOT"
+      },
+      "ROW-7G2o5uDvfo": {
+        "children": [
+          "CHART-c0EjR-OZ0n"
+        ],
+        "id": "ROW-7G2o5uDvfo",
+        "meta": {
+          "background": "BACKGROUND_TRANSPARENT"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS"
+        ],
+        "type": "ROW"
+      },
+      "ROW-7ygtDczaQ": {
+        "children": [
+          "CHART-dxV7Il74hH"
+        ],
+        "id": "ROW-7ygtDczaQ",
+        "meta": {
+          "background": "BACKGROUND_TRANSPARENT"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-gcQJxApOZS",
+          "TABS-afnrUvdxYF",
+          "TAB-jNNd4WWar1"
+        ],
+        "type": "ROW"
+      },
+      "ROW-G73z9PIHn": {
+        "children": [
+          "CHART-jJ5Yj1Ptaz"
+        ],
+        "id": "ROW-G73z9PIHn",
+        "meta": {
+          "background": "BACKGROUND_TRANSPARENT"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj",
+          "TAB-z81Q87PD7"
+        ],
+        "type": "ROW"
+      },
+      "ROW-LCjsdSetJ": {
+        "children": [
+          "CHART-z4gmEuCqQ5"
+        ],
+        "id": "ROW-LCjsdSetJ",
+        "meta": {
+          "background": "BACKGROUND_TRANSPARENT"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj",
+          "TAB-EcNm_wh922"
+        ],
+        "type": "ROW"
+      },
+      "TAB-EcNm_wh922": {
+        "children": [
+          "ROW-LCjsdSetJ"
+        ],
+        "id": "TAB-EcNm_wh922",
+        "meta": {
+          "text": "row tab 1"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj"
+        ],
+        "type": "TAB"
+      },
+      "TAB-NF3dlrWGS": {
+        "children": [
+          "ROW-7G2o5uDvfo",
+          "TABS-CSjo6VfNrj"
+        ],
+        "id": "TAB-NF3dlrWGS",
+        "meta": {
+          "text": "Tab A"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1"
+        ],
+        "type": "TAB"
+      },
+      "TAB-gcQJxApOZS": {
+        "children": [
+          "TABS-afnrUvdxYF"
+        ],
+        "id": "TAB-gcQJxApOZS",
+        "meta": {
+          "text": "Tab B"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1"
+        ],
+        "type": "TAB"
+      },
+      "TAB-jNNd4WWar1": {
+        "children": [
+          "ROW-7ygtDczaQ"
+        ],
+        "id": "TAB-jNNd4WWar1",
+        "meta": {
+          "text": "New Tab"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-gcQJxApOZS",
+          "TABS-afnrUvdxYF"
+        ],
+        "type": "TAB"
+      },
+      "TAB-z81Q87PD7": {
+        "children": [
+          "ROW-G73z9PIHn"
+        ],
+        "id": "TAB-z81Q87PD7",
+        "meta": {
+          "text": "row tab 2"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj"
+        ],
+        "type": "TAB"
+      },
+      "TABS-CSjo6VfNrj": {
+        "children": [
+          "TAB-EcNm_wh922",
+          "TAB-z81Q87PD7"
+        ],
+        "id": "TABS-CSjo6VfNrj",
+        "meta": {},
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS"
+        ],
+        "type": "TABS"
+      },
+      "TABS-afnrUvdxYF": {
+        "children": [
+          "TAB-jNNd4WWar1"
+        ],
+        "id": "TABS-afnrUvdxYF",
+        "meta": {},
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-gcQJxApOZS"
+        ],
+        "type": "TABS"
+      },
+      "TABS-lV0r00f4H1": {
+        "children": [
+          "TAB-NF3dlrWGS",
+          "TAB-gcQJxApOZS"
+        ],
+        "id": "TABS-lV0r00f4H1",
+        "meta": {},
+        "parents": [
+          "ROOT_ID"
+        ],
+        "type": "TABS"
+      }
+    }
+        """
+    )
+    pos = json.loads(js)
+    slices = [
+        db.session.query(Slice).filter_by(slice_name=name).first()
+        for name in tabbed_dash_slices
+    ]
+
+    slices = sorted(slices, key=lambda x: x.id)
+    update_slice_ids(pos, slices)
+    dash.position_json = json.dumps(pos, indent=4)
+    dash.slices = slices
+    dash.dashboard_title = "Tabbed Dashboard"
+    dash.slug = slug
+
+    db.session.merge(dash)
+    db.session.commit()

+ 342 - 0
data/purposeCombined/BI/examples/tabbed_dashboard.py

@@ -0,0 +1,342 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Loads datasets, dashboards and slices in a new superset instance"""
+import json
+import textwrap
+
+from superset import db
+from superset.models.dashboard import Dashboard
+from superset.models.slice import Slice
+
+from .helpers import update_slice_ids
+
+
+def load_tabbed_dashboard(_: bool = False) -> None:
+    """Creating a tabbed dashboard"""
+
+    print("Creating a dashboard with nested tabs")
+    slug = "tabbed_dash"
+    dash = db.session.query(Dashboard).filter_by(slug=slug).first()
+
+    if not dash:
+        dash = Dashboard()
+
+    # reuse charts in "World's Bank Data" and create
+    # new dashboard with nested tabs
+    tabbed_dash_slices = set()
+    tabbed_dash_slices.add("Region Filter")
+    tabbed_dash_slices.add("Growth Rate")
+    tabbed_dash_slices.add("Treemap")
+    tabbed_dash_slices.add("Box plot")
+
+    js = textwrap.dedent(
+        """\
+    {
+      "CHART-c0EjR-OZ0n": {
+        "children": [],
+        "id": "CHART-c0EjR-OZ0n",
+        "meta": {
+          "chartId": 870,
+          "height": 50,
+          "sliceName": "Box plot",
+          "width": 4
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "ROW-7G2o5uDvfo"
+        ],
+        "type": "CHART"
+      },
+      "CHART-dxV7Il74hH": {
+        "children": [],
+        "id": "CHART-dxV7Il74hH",
+        "meta": {
+          "chartId": 797,
+          "height": 50,
+          "sliceName": "Treemap",
+          "width": 4
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-gcQJxApOZS",
+          "TABS-afnrUvdxYF",
+          "TAB-jNNd4WWar1",
+          "ROW-7ygtDczaQ"
+        ],
+        "type": "CHART"
+      },
+      "CHART-jJ5Yj1Ptaz": {
+        "children": [],
+        "id": "CHART-jJ5Yj1Ptaz",
+        "meta": {
+          "chartId": 789,
+          "height": 50,
+          "sliceName": "World's Population",
+          "width": 4
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj",
+          "TAB-z81Q87PD7",
+          "ROW-G73z9PIHn"
+        ],
+        "type": "CHART"
+      },
+      "CHART-z4gmEuCqQ5": {
+        "children": [],
+        "id": "CHART-z4gmEuCqQ5",
+        "meta": {
+          "chartId": 788,
+          "height": 50,
+          "sliceName": "Region Filter",
+          "width": 4
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj",
+          "TAB-EcNm_wh922",
+          "ROW-LCjsdSetJ"
+        ],
+        "type": "CHART"
+      },
+      "DASHBOARD_VERSION_KEY": "v2",
+      "GRID_ID": {
+        "children": [],
+        "id": "GRID_ID",
+        "type": "GRID"
+      },
+      "HEADER_ID": {
+        "id": "HEADER_ID",
+        "meta": {
+          "text": "Tabbed Dashboard"
+        },
+        "type": "HEADER"
+      },
+      "ROOT_ID": {
+        "children": [
+          "TABS-lV0r00f4H1"
+        ],
+        "id": "ROOT_ID",
+        "type": "ROOT"
+      },
+      "ROW-7G2o5uDvfo": {
+        "children": [
+          "CHART-c0EjR-OZ0n"
+        ],
+        "id": "ROW-7G2o5uDvfo",
+        "meta": {
+          "background": "BACKGROUND_TRANSPARENT"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS"
+        ],
+        "type": "ROW"
+      },
+      "ROW-7ygtDczaQ": {
+        "children": [
+          "CHART-dxV7Il74hH"
+        ],
+        "id": "ROW-7ygtDczaQ",
+        "meta": {
+          "background": "BACKGROUND_TRANSPARENT"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-gcQJxApOZS",
+          "TABS-afnrUvdxYF",
+          "TAB-jNNd4WWar1"
+        ],
+        "type": "ROW"
+      },
+      "ROW-G73z9PIHn": {
+        "children": [
+          "CHART-jJ5Yj1Ptaz"
+        ],
+        "id": "ROW-G73z9PIHn",
+        "meta": {
+          "background": "BACKGROUND_TRANSPARENT"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj",
+          "TAB-z81Q87PD7"
+        ],
+        "type": "ROW"
+      },
+      "ROW-LCjsdSetJ": {
+        "children": [
+          "CHART-z4gmEuCqQ5"
+        ],
+        "id": "ROW-LCjsdSetJ",
+        "meta": {
+          "background": "BACKGROUND_TRANSPARENT"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj",
+          "TAB-EcNm_wh922"
+        ],
+        "type": "ROW"
+      },
+      "TAB-EcNm_wh922": {
+        "children": [
+          "ROW-LCjsdSetJ"
+        ],
+        "id": "TAB-EcNm_wh922",
+        "meta": {
+          "text": "row tab 1"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj"
+        ],
+        "type": "TAB"
+      },
+      "TAB-NF3dlrWGS": {
+        "children": [
+          "ROW-7G2o5uDvfo",
+          "TABS-CSjo6VfNrj"
+        ],
+        "id": "TAB-NF3dlrWGS",
+        "meta": {
+          "text": "Tab A"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1"
+        ],
+        "type": "TAB"
+      },
+      "TAB-gcQJxApOZS": {
+        "children": [
+          "TABS-afnrUvdxYF"
+        ],
+        "id": "TAB-gcQJxApOZS",
+        "meta": {
+          "text": "Tab B"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1"
+        ],
+        "type": "TAB"
+      },
+      "TAB-jNNd4WWar1": {
+        "children": [
+          "ROW-7ygtDczaQ"
+        ],
+        "id": "TAB-jNNd4WWar1",
+        "meta": {
+          "text": "New Tab"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-gcQJxApOZS",
+          "TABS-afnrUvdxYF"
+        ],
+        "type": "TAB"
+      },
+      "TAB-z81Q87PD7": {
+        "children": [
+          "ROW-G73z9PIHn"
+        ],
+        "id": "TAB-z81Q87PD7",
+        "meta": {
+          "text": "row tab 2"
+        },
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS",
+          "TABS-CSjo6VfNrj"
+        ],
+        "type": "TAB"
+      },
+      "TABS-CSjo6VfNrj": {
+        "children": [
+          "TAB-EcNm_wh922",
+          "TAB-z81Q87PD7"
+        ],
+        "id": "TABS-CSjo6VfNrj",
+        "meta": {},
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-NF3dlrWGS"
+        ],
+        "type": "TABS"
+      },
+      "TABS-afnrUvdxYF": {
+        "children": [
+          "TAB-jNNd4WWar1"
+        ],
+        "id": "TABS-afnrUvdxYF",
+        "meta": {},
+        "parents": [
+          "ROOT_ID",
+          "TABS-lV0r00f4H1",
+          "TAB-gcQJxApOZS"
+        ],
+        "type": "TABS"
+      },
+      "TABS-lV0r00f4H1": {
+        "children": [
+          "TAB-NF3dlrWGS",
+          "TAB-gcQJxApOZS"
+        ],
+        "id": "TABS-lV0r00f4H1",
+        "meta": {},
+        "parents": [
+          "ROOT_ID"
+        ],
+        "type": "TABS"
+      }
+    }
+        """
+    )
+    pos = json.loads(js)
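+    # These four slices are created by the World Bank example, which is therefore
+    # expected to have been loaded before this dashboard is built.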
+    slices = [
+        db.session.query(Slice).filter_by(slice_name=name).first()
+        for name in tabbed_dash_slices
+    ]
+
+    slices = sorted(slices, key=lambda x: x.id)
+    update_slice_ids(pos, slices)
+    dash.position_json = json.dumps(pos, indent=4)
+    dash.slices = slices
+    dash.dashboard_title = "Tabbed Dashboard"
+    dash.slug = slug
+
+    db.session.merge(dash)
+    db.session.commit()

+ 163 - 0
data/purposeCombined/BI/examples/unicode_test_data.py

@@ -0,0 +1,163 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import datetime
+import json
+import random
+
+import pandas as pd
+from sqlalchemy import Date, Float, String
+
+from superset import db
+from superset.models.dashboard import Dashboard
+from superset.models.slice import Slice
+from superset.utils import core as utils
+
+from .helpers import (
+    config,
+    get_example_data,
+    get_slice_json,
+    merge_slice,
+    TBL,
+    update_slice_ids,
+)
+
+
+def load_unicode_test_data(only_metadata: bool = False, force: bool = False) -> None:
+    """Loading unicode test dataset from a csv file in the repo"""
+    tbl_name = "unicode_test"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data(
+            "unicode_utf8_unixnl_test.csv", is_gzip=False, make_bytes=True
+        )
+        df = pd.read_csv(data, encoding="utf-8")
+        # generate date/numeric data
+        df["dttm"] = datetime.datetime.now().date()
+        df["value"] = [random.randint(1, 100) for _ in range(len(df))]
+        df.to_sql(  # pylint: disable=no-member
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={
+                "phrase": String(500),
+                "short_phrase": String(10),
+                "with_missing": String(100),
+                "dttm": Date(),
+                "value": Float(),
+            },
+            index=False,
+        )
+        print("Done loading table!")
+        print("-" * 80)
+
+    print("Creating table [unicode_test] reference")
+    obj = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not obj:
+        obj = TBL(table_name=tbl_name)
+    obj.main_dttm_col = "dttm"
+    obj.database = database
+    db.session.merge(obj)
+    db.session.commit()
+    obj.fetch_metadata()
+    tbl = obj
+
+    slice_data = {
+        "granularity_sqla": "dttm",
+        "groupby": [],
+        "metric": {
+            "aggregate": "SUM",
+            "column": {"column_name": "value"},
+            "expressionType": "SIMPLE",
+            "label": "Value",
+        },
+        "row_limit": config["ROW_LIMIT"],
+        "since": "100 years ago",
+        "until": "now",
+        "viz_type": "word_cloud",
+        "size_from": "10",
+        "series": "short_phrase",
+        "size_to": "70",
+        "rotation": "square",
+        "limit": "100",
+    }
+
+    print("Creating a slice")
+    slc = Slice(
+        slice_name="Unicode Cloud",
+        viz_type="word_cloud",
+        datasource_type="table",
+        datasource_id=tbl.id,
+        params=get_slice_json(slice_data),
+    )
+    merge_slice(slc)
+
+    print("Creating a dashboard")
+    dash = db.session.query(Dashboard).filter_by(slug="unicode-test").first()
+
+    if not dash:
+        dash = Dashboard()
+    js = """\
+{
+    "CHART-Hkx6154FEm": {
+        "children": [],
+        "id": "CHART-Hkx6154FEm",
+        "meta": {
+            "chartId": 2225,
+            "height": 30,
+            "sliceName": "slice 1",
+            "width": 4
+        },
+        "type": "CHART"
+    },
+    "GRID_ID": {
+        "children": [
+            "ROW-SyT19EFEQ"
+        ],
+        "id": "GRID_ID",
+        "type": "GRID"
+    },
+    "ROOT_ID": {
+        "children": [
+            "GRID_ID"
+        ],
+        "id": "ROOT_ID",
+        "type": "ROOT"
+    },
+    "ROW-SyT19EFEQ": {
+        "children": [
+            "CHART-Hkx6154FEm"
+        ],
+        "id": "ROW-SyT19EFEQ",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "DASHBOARD_VERSION_KEY": "v2"
+}
+    """
+    dash.dashboard_title = "Unicode Test"
+    pos = json.loads(js)
+    update_slice_ids(pos, [slc])
+    dash.position_json = json.dumps(pos, indent=4)
+    dash.slug = "unicode-test"
+    dash.slices = [slc]
+    db.session.merge(dash)
+    db.session.commit()

+ 574 - 0
data/purposeCombined/BI/examples/world_bank.py

@@ -0,0 +1,574 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Loads datasets, dashboards and slices in a new superset instance"""
+import json
+import os
+import textwrap
+
+import pandas as pd
+from sqlalchemy import DateTime, String
+from sqlalchemy.sql import column
+
+from superset import db
+from superset.connectors.sqla.models import SqlMetric
+from superset.models.dashboard import Dashboard
+from superset.models.slice import Slice
+from superset.utils import core as utils
+
+from .helpers import (
+    config,
+    EXAMPLES_FOLDER,
+    get_example_data,
+    get_slice_json,
+    merge_slice,
+    misc_dash_slices,
+    TBL,
+    update_slice_ids,
+)
+
+
+def load_world_bank_health_n_pop(  # pylint: disable=too-many-locals
+    only_metadata: bool = False, force: bool = False
+) -> None:
+    """Loads the world bank health dataset, slices and a dashboard"""
+    tbl_name = "wb_health_population"
+    database = utils.get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        data = get_example_data("countries.json.gz")
+        pdf = pd.read_json(data)
+        pdf.columns = [col.replace(".", "_") for col in pdf.columns]
+        pdf.year = pd.to_datetime(pdf.year)
+        pdf.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=50,
+            dtype={
+                "year": DateTime(),
+                "country_code": String(3),
+                "country_name": String(255),
+                "region": String(255),
+            },
+            index=False,
+        )
+
+    print("Creating table [wb_health_population] reference")
+    tbl = db.session.query(TBL).filter_by(table_name=tbl_name).first()
+    if not tbl:
+        tbl = TBL(table_name=tbl_name)
+    tbl.description = utils.readfile(os.path.join(EXAMPLES_FOLDER, "countries.md"))
+    tbl.main_dttm_col = "year"
+    tbl.database = database
+    tbl.filter_select_enabled = True
+
+    metrics = [
+        "sum__SP_POP_TOTL",
+        "sum__SH_DYN_AIDS",
+        "sum__SH_DYN_AIDS",
+        "sum__SP_RUR_TOTL_ZS",
+        "sum__SP_DYN_LE00_IN",
+        "sum__SP_RUR_TOTL",
+    ]
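+    # Metric names follow "<agg>__<column>": the first three characters give the
+    # aggregate function and everything after "__" names the source column.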
+    for metric in metrics:
+        if not any(col.metric_name == metric for col in tbl.metrics):
+            aggr_func = metric[:3]
+            col = str(column(metric[5:]).compile(db.engine))
+            tbl.metrics.append(
+                SqlMetric(metric_name=metric, expression=f"{aggr_func}({col})")
+            )
+
+    db.session.merge(tbl)
+    db.session.commit()
+    tbl.fetch_metadata()
+
+    metric = "sum__SP_POP_TOTL"
+    metrics = ["sum__SP_POP_TOTL"]
+    secondary_metric = {
+        "aggregate": "SUM",
+        "column": {
+            "column_name": "SP_RUR_TOTL",
+            "optionName": "_col_SP_RUR_TOTL",
+            "type": "DOUBLE",
+        },
+        "expressionType": "SIMPLE",
+        "hasCustomLabel": True,
+        "label": "Rural Population",
+    }
+
+    defaults = {
+        "compare_lag": "10",
+        "compare_suffix": "o10Y",
+        "limit": "25",
+        "granularity_sqla": "year",
+        "groupby": [],
+        "row_limit": config["ROW_LIMIT"],
+        "since": "2014-01-01",
+        "until": "2014-01-02",
+        "time_range": "2014-01-01 : 2014-01-02",
+        "markup_type": "markdown",
+        "country_fieldtype": "cca3",
+        "entity": "country_code",
+        "show_bubbles": True,
+    }
+
+    print("Creating slices")
+    slices = [
+        Slice(
+            slice_name="Region Filter",
+            viz_type="filter_box",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="filter_box",
+                date_filter=False,
+                filter_configs=[
+                    {
+                        "asc": False,
+                        "clearable": True,
+                        "column": "region",
+                        "key": "2s98dfu",
+                        "metric": "sum__SP_POP_TOTL",
+                        "multiple": True,
+                    },
+                    {
+                        "asc": False,
+                        "clearable": True,
+                        "key": "li3j2lk",
+                        "column": "country_name",
+                        "metric": "sum__SP_POP_TOTL",
+                        "multiple": True,
+                    },
+                ],
+            ),
+        ),
+        Slice(
+            slice_name="World's Population",
+            viz_type="big_number",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                since="2000",
+                viz_type="big_number",
+                compare_lag="10",
+                metric="sum__SP_POP_TOTL",
+                compare_suffix="over 10Y",
+            ),
+        ),
+        Slice(
+            slice_name="Most Populated Countries",
+            viz_type="table",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="table",
+                metrics=["sum__SP_POP_TOTL"],
+                groupby=["country_name"],
+            ),
+        ),
+        Slice(
+            slice_name="Growth Rate",
+            viz_type="line",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="line",
+                since="1960-01-01",
+                metrics=["sum__SP_POP_TOTL"],
+                num_period_compare="10",
+                groupby=["country_name"],
+            ),
+        ),
+        Slice(
+            slice_name="% Rural",
+            viz_type="world_map",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="world_map",
+                metric="sum__SP_RUR_TOTL_ZS",
+                num_period_compare="10",
+                secondary_metric=secondary_metric,
+            ),
+        ),
+        Slice(
+            slice_name="Life Expectancy VS Rural %",
+            viz_type="bubble",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="bubble",
+                since="2011-01-01",
+                until="2011-01-02",
+                series="region",
+                limit=0,
+                entity="country_name",
+                x="sum__SP_RUR_TOTL_ZS",
+                y="sum__SP_DYN_LE00_IN",
+                size="sum__SP_POP_TOTL",
+                max_bubble_size="50",
+                adhoc_filters=[
+                    {
+                        "clause": "WHERE",
+                        "expressionType": "SIMPLE",
+                        "filterOptionName": "2745eae5",
+                        "comparator": [
+                            "TCA",
+                            "MNP",
+                            "DMA",
+                            "MHL",
+                            "MCO",
+                            "SXM",
+                            "CYM",
+                            "TUV",
+                            "IMY",
+                            "KNA",
+                            "ASM",
+                            "ADO",
+                            "AMA",
+                            "PLW",
+                        ],
+                        "operator": "NOT IN",
+                        "subject": "country_code",
+                    }
+                ],
+            ),
+        ),
+        Slice(
+            slice_name="Rural Breakdown",
+            viz_type="sunburst",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                viz_type="sunburst",
+                groupby=["region", "country_name"],
+                since="2011-01-01",
+                until="2011-01-01",
+                metric=metric,
+                secondary_metric=secondary_metric,
+            ),
+        ),
+        Slice(
+            slice_name="World's Pop Growth",
+            viz_type="area",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                since="1960-01-01",
+                until="now",
+                viz_type="area",
+                groupby=["region"],
+                metrics=metrics,
+            ),
+        ),
+        Slice(
+            slice_name="Box plot",
+            viz_type="box_plot",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                since="1960-01-01",
+                until="now",
+                whisker_options="Min/max (no outliers)",
+                x_ticks_layout="staggered",
+                viz_type="box_plot",
+                groupby=["region"],
+                metrics=metrics,
+            ),
+        ),
+        Slice(
+            slice_name="Treemap",
+            viz_type="treemap",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                since="1960-01-01",
+                until="now",
+                viz_type="treemap",
+                metrics=["sum__SP_POP_TOTL"],
+                groupby=["region", "country_code"],
+            ),
+        ),
+        Slice(
+            slice_name="Parallel Coordinates",
+            viz_type="para",
+            datasource_type="table",
+            datasource_id=tbl.id,
+            params=get_slice_json(
+                defaults,
+                since="2011-01-01",
+                until="2011-01-01",
+                viz_type="para",
+                limit=100,
+                metrics=["sum__SP_POP_TOTL", "sum__SP_RUR_TOTL_ZS", "sum__SH_DYN_AIDS"],
+                secondary_metric="sum__SP_POP_TOTL",
+                series="country_name",
+            ),
+        ),
+    ]
+    misc_dash_slices.add(slices[-1].slice_name)
+    for slc in slices:
+        merge_slice(slc)
+
+    print("Creating a World's Health Bank dashboard")
+    dash_name = "World Bank's Data"
+    slug = "world_health"
+    dash = db.session.query(Dashboard).filter_by(slug=slug).first()
+
+    if not dash:
+        dash = Dashboard()
+    dash.published = True
+    js = textwrap.dedent(
+        """\
+{
+    "CHART-36bfc934": {
+        "children": [],
+        "id": "CHART-36bfc934",
+        "meta": {
+            "chartId": 40,
+            "height": 25,
+            "sliceName": "Region Filter",
+            "width": 2
+        },
+        "type": "CHART"
+    },
+    "CHART-37982887": {
+        "children": [],
+        "id": "CHART-37982887",
+        "meta": {
+            "chartId": 41,
+            "height": 25,
+            "sliceName": "World's Population",
+            "width": 2
+        },
+        "type": "CHART"
+    },
+    "CHART-17e0f8d8": {
+        "children": [],
+        "id": "CHART-17e0f8d8",
+        "meta": {
+            "chartId": 42,
+            "height": 92,
+            "sliceName": "Most Populated Countries",
+            "width": 3
+        },
+        "type": "CHART"
+    },
+    "CHART-2ee52f30": {
+        "children": [],
+        "id": "CHART-2ee52f30",
+        "meta": {
+            "chartId": 43,
+            "height": 38,
+            "sliceName": "Growth Rate",
+            "width": 6
+        },
+        "type": "CHART"
+    },
+    "CHART-2d5b6871": {
+        "children": [],
+        "id": "CHART-2d5b6871",
+        "meta": {
+            "chartId": 44,
+            "height": 52,
+            "sliceName": "% Rural",
+            "width": 7
+        },
+        "type": "CHART"
+    },
+    "CHART-0fd0d252": {
+        "children": [],
+        "id": "CHART-0fd0d252",
+        "meta": {
+            "chartId": 45,
+            "height": 50,
+            "sliceName": "Life Expectancy VS Rural %",
+            "width": 8
+        },
+        "type": "CHART"
+    },
+    "CHART-97f4cb48": {
+        "children": [],
+        "id": "CHART-97f4cb48",
+        "meta": {
+            "chartId": 46,
+            "height": 38,
+            "sliceName": "Rural Breakdown",
+            "width": 3
+        },
+        "type": "CHART"
+    },
+    "CHART-b5e05d6f": {
+        "children": [],
+        "id": "CHART-b5e05d6f",
+        "meta": {
+            "chartId": 47,
+            "height": 50,
+            "sliceName": "World's Pop Growth",
+            "width": 4
+        },
+        "type": "CHART"
+    },
+    "CHART-e76e9f5f": {
+        "children": [],
+        "id": "CHART-e76e9f5f",
+        "meta": {
+            "chartId": 48,
+            "height": 50,
+            "sliceName": "Box plot",
+            "width": 4
+        },
+        "type": "CHART"
+    },
+    "CHART-a4808bba": {
+        "children": [],
+        "id": "CHART-a4808bba",
+        "meta": {
+            "chartId": 49,
+            "height": 50,
+            "sliceName": "Treemap",
+            "width": 8
+        },
+        "type": "CHART"
+    },
+    "COLUMN-071bbbad": {
+        "children": [
+            "ROW-1e064e3c",
+            "ROW-afdefba9"
+        ],
+        "id": "COLUMN-071bbbad",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT",
+            "width": 9
+        },
+        "type": "COLUMN"
+    },
+    "COLUMN-fe3914b8": {
+        "children": [
+            "CHART-36bfc934",
+            "CHART-37982887"
+        ],
+        "id": "COLUMN-fe3914b8",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT",
+            "width": 2
+        },
+        "type": "COLUMN"
+    },
+    "GRID_ID": {
+        "children": [
+            "ROW-46632bc2",
+            "ROW-3fa26c5d",
+            "ROW-812b3f13"
+        ],
+        "id": "GRID_ID",
+        "type": "GRID"
+    },
+    "HEADER_ID": {
+        "id": "HEADER_ID",
+        "meta": {
+            "text": "World's Bank Data"
+        },
+        "type": "HEADER"
+    },
+    "ROOT_ID": {
+        "children": [
+            "GRID_ID"
+        ],
+        "id": "ROOT_ID",
+        "type": "ROOT"
+    },
+    "ROW-1e064e3c": {
+        "children": [
+            "COLUMN-fe3914b8",
+            "CHART-2d5b6871"
+        ],
+        "id": "ROW-1e064e3c",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "ROW-3fa26c5d": {
+        "children": [
+            "CHART-b5e05d6f",
+            "CHART-0fd0d252"
+        ],
+        "id": "ROW-3fa26c5d",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "ROW-46632bc2": {
+        "children": [
+            "COLUMN-071bbbad",
+            "CHART-17e0f8d8"
+        ],
+        "id": "ROW-46632bc2",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "ROW-812b3f13": {
+        "children": [
+            "CHART-a4808bba",
+            "CHART-e76e9f5f"
+        ],
+        "id": "ROW-812b3f13",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "ROW-afdefba9": {
+        "children": [
+            "CHART-2ee52f30",
+            "CHART-97f4cb48"
+        ],
+        "id": "ROW-afdefba9",
+        "meta": {
+            "background": "BACKGROUND_TRANSPARENT"
+        },
+        "type": "ROW"
+    },
+    "DASHBOARD_VERSION_KEY": "v2"
+}
+    """
+    )
+    pos = json.loads(js)
+    update_slice_ids(pos, slices)
+
+    dash.dashboard_title = dash_name
+    dash.position_json = json.dumps(pos, indent=4)
+    dash.slug = slug
+
+    dash.slices = slices[:-1]
+    db.session.merge(dash)
+    db.session.commit()

+ 580 - 0
data/purposeCombined/BI/income_disparity_final_version_2.py

@@ -0,0 +1,580 @@
+# -*- coding: utf-8 -*-
+"""income_disparity.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/1upuHuQ3gWDkpbvkvHl2uTQlSv20JZnf2
+"""
+
+
+#!pip install pandas-datareader
+import wbdata
+import datetime
+import numpy as np
+import pandas as pd
+from pandas_datareader import wb
+import matplotlib.pyplot as plt
+from sklearn.linear_model import LinearRegression as lr
+from matplotlib.pyplot import MultipleLocator
+
+
+# =============================================================================
+# # Part 1: API Integration
+# =============================================================================
+
+# =============================================================================
+# # API method 1: using wbdata module
+# =============================================================================
+
+# #searching for countries index using names
+# print(wbdata.search_countries('United Kingdom'))
+
+# list of countries
+countries = ["USA", "BEL", "BRA", "COL", "FRA", "DEU", "GRC", "IDN", "IRL", "MEX", "NLD", "RUS"]
+# date period
+dates = datetime.datetime(2008, 1, 1), datetime.datetime(2018, 1, 1)
+
+# data object
+indicators = {'SI.DST.05TH.20':'Income share held by highest 20%', 'SI.DST.FRST.20': 'Income share held by lowest 20%', \
+             'SL.EMP.TOTL.SP.FE.NE.ZS': 'Employment to population ratio, 15+, female (%) (national estimate)',\
+             'SL.EMP.TOTL.SP.MA.NE.ZS': 'Employment to population ratio, 15+, male (%) (national estimate)'}
+
+# getting data from these countries
+raw_data = wbdata.get_dataframe(indicators, country=countries, data_date=dates, convert_date=True)
+
+raw_unstacked_data = raw_data.unstack(level=0)
+
+# printing our data object
+# print(raw_data)
+# print(raw_unstacked_data)
+
+# =============================================================================
+# # API method 2: using "from pandas_datareader import wb" and converting the result to a DataFrame
+# =============================================================================
+
+# view all data
+pd.set_option('display.max_columns', 15) 
+pd.set_option('display.max_rows', 15) 
+
+df1 = wb.download(indicator = indicators, country = countries,  start = 2008, end = 2018)
+date_period = [i for i in range(2008, 2019)]
+print(df1)
+
+# create a new DataFrame df2 for later use, so that calculations on df2 do not change the original values in df1
+# rename the columns
+df2 = df1.rename(columns = {'SI.DST.05TH.20':'Income share held by highest 20%', 'SI.DST.FRST.20': 'Income share held by lowest 20%', \
+             'SL.EMP.TOTL.SP.FE.NE.ZS': 'Employment to population ratio, 15+, female (%) (national estimate)',\
+             'SL.EMP.TOTL.SP.MA.NE.ZS': 'Employment to population ratio, 15+, male (%) (national estimate)'}, inplace = False)
+
+# overview our data object DataFrame
+# Data manipulation: deal with missing values by replacing them with the column mean, which has limited impact on our data set
+df2.mean()
+df2.fillna(df2.mean(), inplace = True)
+print(df2)
+
+# Overview the edited DataFrame and get basic summary statistics
+print(df2.describe())
+
+
+
+
+# =============================================================================
+# # Part 2: Data structure set up
+# =============================================================================
+
+# =============================================================================
+# # creating our Data Structure type I
+# =============================================================================
+
+# step I: convert DataFrame to a list in correct order from 2008 to 2018
+def country_DataFrame_to_list(country, target_data):
+  df = wb.download(indicator = target_data, country = country,  start = 2008, end = 2018)
+  df.fillna(df.mean(), inplace = True)
+  df_list =df[df.columns[0]].tolist()
+  round_list = [round(i, 2) for i in df_list ]
+  return round_list[::-1]
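+# Example (illustrative; actual values depend on the World Bank API response):
+# country_DataFrame_to_list("USA", 'SI.DST.05TH.20') returns the 2008-2018 values
+# rounded to 2 decimals, oldest first, e.g. [v_2008, v_2009, ..., v_2018].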
+
+# step II: make a list of tuples, which is a convenient way to store our data
+def country_tuples(country_list, time):
+  return list(zip(country_list, time))
+
+# step III: additional helper for calculating the element-wise gap between two lists
+def gap_between(toplist, lowlist):
+  gap_list = []
+  for i in range(len(toplist)):
+    gap_list.append(round((toplist[i]- lowlist[i]), 2))
+  return gap_list
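+# Worked example: gap_between([50.0, 51.2], [5.0, 5.3]) -> [45.0, 45.9],
+# i.e. the element-wise difference of the two lists, rounded to 2 decimals.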
+
+
+
+# step IV: make a dictionary of lists of tuples, which is one of the data structures of this project,
+# referred to as Data Structure type I.
+def object_Dictionary(country_list, object_target, date_period):
+  object_df = {}
+  for country in country_list:
+    object_df[country] = country_tuples(date_period, country_DataFrame_to_list(country, object_target))
+  return object_df
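+# Resulting shape (illustrative): a dict keyed by country code, each value a list of
+# (year, value) tuples, e.g. {"USA": [(2008, <value>), ..., (2018, <value>)], ...}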
+
+# step V: start building the type I structures:
+    
+    
+# This data set is for storing data of Income share held by highest 20%
+Top_20_df = object_Dictionary(countries, 'SI.DST.05TH.20', date_period)
+
+# This data set is for storing data of Income share held by lowest 20%
+Low_20_df = object_Dictionary(countries, 'SI.DST.FRST.20', date_period)
+
+# This data set is for storing data of 'Employment to population ratio, 15+, female (%) (national estimate)'
+female_employ_df = object_Dictionary(countries, 'SL.EMP.TOTL.SP.FE.NE.ZS', date_period)
+
+# This data set is for storing data of 'Employment to population ratio, 15+, male (%) (national estimate)'
+male_employ_df = object_Dictionary(countries, 'SL.EMP.TOTL.SP.MA.NE.ZS', date_period)
+
+
+
+
+# =============================================================================
+# # creating our Data Structure type II: convert our Data Structure type I to type II
+# =============================================================================
+# step 1: write a function that unpacks a dictionary of tuples into a new dictionary of plain lists and calculates the gap
+def no_tuple_dic(object_Dictionary1, object_Dictionary2):
+  new_dict = {}
+  for i in countries:
+    new_list = []
+    for j in range(11):
+      # The gap helper above is not reused here because the new dictionary should not include the year
+      new_list.append(round((object_Dictionary1[i][j][1]- object_Dictionary2[i][j][1]), 2)) 
+    new_dict[i] = new_list  
+
+  return new_dict
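+# Resulting shape (illustrative): {"USA": [gap_2008, ..., gap_2018], "BEL": [...], ...}
+# i.e. the year labels are dropped and only the 11 yearly gap values per country remain.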
+
+# step 2: get the income gap dictionary of lists between income share held by the highest 20% and the lowest 20%
+income_gap_dict = no_tuple_dic(Top_20_df, Low_20_df)
+
+# step 3: create our Data structure type II, DataFrame
+income_gap_dict_df = pd.DataFrame(income_gap_dict, columns = countries)
+
+# step 4: show the basic statistic info of our income gap DataFrame
+print(round(income_gap_dict_df.describe(),2))
+
+# same step as above, to get our Data Structure type II, between male employment population and female employment population
+gender_gap_dict = no_tuple_dic(male_employ_df, female_employ_df)
+
+gender_gap_dict_df = pd.DataFrame(gender_gap_dict, columns = countries)
+print(round(gender_gap_dict_df.describe(),2))
+
+
+
+# Data Structure function application
+
+# This function calculates, for each year, the gap between income share held by the highest 20% and the lowest 20%
+def gap_income_Dataframe(country):
+  gap = {}
+  for i in range(len(Top_20_df[country])):
+    year1, data1 = Top_20_df[country][i]
+    year2, data2 = Low_20_df[country][i]  
+    if year1 == year2:
+      gap[year1] = round(data1-data2, 2)
+  return gap
+
+# This function calculates, for each year, the gap between the male and female employment-to-population ratios
+def gap_gender_Dataframe(country):
+  gap = {}
+  for i in range(len(Top_20_df[country])):
+    year1, data1 = male_employ_df[country][i]
+    year2, data2 = female_employ_df[country][i]  
+    if year1 == year2:
+      gap[year1] = round(data1-data2, 2)
+  return gap
+
+# This function searches the data for a specific country and year
+def searching_data(object_Dictionary, country, year):
+  country_list = []
+  if country in countries:
+    for i in range(11):
+      country_list.append(object_Dictionary[country][i])
+  
+  output = [item for item in country_list if item[0] == year]
+  # returns an empty list if no data is found, or a single-item list of (year, value) if the country and year are valid
+  return output
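+# Usage example (illustrative): searching_data(Top_20_df, "USA", 2010)
+# returns [(2010, <income share value>)], or [] if the country/year is not present.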
+
+
+
+
+
+# =============================================================================
+# # Part 3: Plotting the data set
+# =============================================================================
+
+
+# =============================================================================
+# #plot 1: Income gap from 2008 to 2018
+# =============================================================================
+
+from matplotlib.pyplot import MultipleLocator
+plt.title('Income gap from 2008 to 2018')
+plt.xlabel('Year')
+plt.ylabel('Income gap%')
+all_data_i = []
+
+for c in countries:
+  gap_i = gap_income_Dataframe(c)
+  x_i = gap_i.keys()
+  y_i = gap_i.values()
+  all_data_i.append(gap_i)
+  plt.scatter(x_i,y_i,marker='+',label=c)
+  plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
+
+x_major_locator=MultipleLocator(1)  #set the x interval as 1
+y_major_locator=MultipleLocator(2)   #set the y interval as 2
+ax=plt.gca()
+ax.xaxis.set_major_locator(x_major_locator)     #Set the major scale of the x-axis to a multiple of 1
+ax.yaxis.set_major_locator(y_major_locator)     #Set the major scale of the y-axis to a multiple of 2
+plt.xlim(2007,2019)   #Set the x-axis range; 2007-2019 is used so the 2008 and 2018 points are clearly visible
+plt.ylim(25,60)     #Set the y scale range of the y-axis from 25 to 60
+
+N = 10000
+xr_i = list(range(2008,2019))
+yr_i = []
+for i in xr_i:
+  temp = 0
+  for j in all_data_i:
+    temp += j[i]
+  temp /= len(countries)
+  yr_i.append(temp)
+plt.plot(xr_i,yr_i,"r-",label='average')
+plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
+plt.savefig('Income gap.pdf')  
+plt.show()
+
+# =============================================================================
+# #plot 2: Gender Employment rate gap from 2008 to 2018
+# =============================================================================
+
+plt.title('Gender Employment rate gap from 2008 to 2018')
+plt.xlabel('Year')
+plt.ylabel('Gender Employment Gap %')
+all_data_j = []
+for c in countries:
+  gap_j = gap_gender_Dataframe(c)
+  x_j = gap_j.keys()
+  y_j = gap_j.values()
+  all_data_j.append(gap_j)
+  plt.scatter(x_j,y_j,marker='+',label=c)
+  plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
+
+x_major_locator=MultipleLocator(1)  #set the x interval as 1
+y_major_locator=MultipleLocator(2)   #set the y interval as 2
+ax=plt.gca()
+ax.xaxis.set_major_locator(x_major_locator)     #Set the major scale of the x-axis to a multiple of 1
+ax.yaxis.set_major_locator(y_major_locator)     #Set the major scale of the y-axis to a multiple of 2
+plt.xlim(2007,2019)   #Set the scale range of the x-axis from 2008 to 2018
+plt.ylim(6,38)     #Set the scale range of the y-axis from 6 to 38
+
+N = 10000
+xr_j = list(range(2008,2019))
+yr_j = []
+for i in xr_j:
+  temp = 0
+  for j in all_data_j:
+    temp += j[i]
+  temp /= len(countries)
+  yr_j.append(temp)
+plt.plot(xr_j,yr_j,"r-",label='average')
+plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
+plt.show()
+
+# =============================================================================
+# #boxplot 1 income gap
+# =============================================================================
+
+plt.figure(figsize=(9,6),dpi=60)
+
+labels, data = [*zip(*income_gap_dict.items())]  # 'transpose' items to parallel key, value lists
+
+# or, backwards compatible:
+labels, data = income_gap_dict.keys(), income_gap_dict.values()
+plt.title('Income Gap from 2008 to 2018')
+plt.xlabel('Country')
+plt.ylabel('Income Gap %')
+plt.boxplot(data)
+plt.xticks(range(1, len(labels) + 1), labels)
+plt.show()
+
+# =============================================================================
+# #boxplot 2 gender employment gap
+# =============================================================================
+
+plt.figure(figsize=(9,6),dpi=60)
+
+labels, data = [*zip(*gender_gap_dict.items())]  # 'transpose' items to parallel key, value lists
+
+# or, backwards compatible:
+labels, data = gender_gap_dict.keys(), gender_gap_dict.values()
+plt.title('Gender Employment Gap')
+plt.xlabel('Country')
+plt.ylabel('Gender Employment Gap %')
+plt.boxplot(data)
+plt.xticks(range(1, len(labels) + 1), labels)
+plt.show()
+
+# =============================================================================
+# #Part 4: linear regression
+# =============================================================================
+
+import numpy as np
+from sklearn.linear_model import LinearRegression
+
+# Convert the original data frame to list
+def convert_to_target_data_dict(country_list):
+    converted_dict = {}
+
+    for i in range(len(country_list)):
+        country_name = country_list[i]
+        converted_dict[country_name] = {}
+        gap_income_dict = gap_income_Dataframe(country_name)
+        gap_gender_dict = gap_gender_Dataframe(country_name)
+        converted_gap_income_list = []
+        converted_gap_gender_list = []
+
+        for k in gap_income_dict:
+            converted_gap_income_list.append(gap_income_dict[k])
+            converted_gap_gender_list.append(gap_gender_dict[k])
+
+        converted_dict[country_name]["income"] = converted_gap_income_list
+        converted_dict[country_name]["gender"] = converted_gap_gender_list
+
+    return converted_dict
+
+
+# Work out the x-coordinates for linear regression
+def x_coordinate():
+    x_list = []
+    x_coordinate = 2008
+    for i in range(11):
+        x_list.append(x_coordinate)
+        x_coordinate = x_coordinate + 1
+
+    return x_list
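+# Example: x_coordinate() -> [2008, 2009, ..., 2018] (the 11 yearly x-values).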
+
+
+# Work out the linear regression for single country
+def linear_regression(country_name, coordinate_dict, data_type, predict_time):
+    y_list = coordinate_dict[country_name][data_type]
+    x_list = x_coordinate()
+    x = np.array(x_list).reshape((-1, 1))
+    y = np.array(y_list)
+
+    linear_model = LinearRegression().fit(x, y)
+
+    predict_year = np.array([predict_time]).reshape((-1, 1))
+    ten_year_prediction = linear_model.predict(predict_year)
+    
+
+    return ten_year_prediction[0]
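+# Note: scikit-learn's LinearRegression expects a 2-D feature matrix, hence the
+# reshape((-1, 1)) above. Usage example (illustrative):
+# linear_regression("USA", convert_to_target_data_dict(countries), "income", 2030)
+# returns the predicted income gap for the USA in 2030 as a single float.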
+
+
+# Work out the final predicted result for the income and gender gap of 2030
+def total_linear_regression_result(y_coordinate_dict):
+    linear_regression_result_dict = {}
+
+    for k in y_coordinate_dict:
+        linear_regression_result_dict[k] = {}
+        predict_income_gap_2030 = linear_regression(k, y_coordinate_dict, "income", 2030)
+        predict_gender_gap_2030 = linear_regression(k, y_coordinate_dict, "gender", 2030)
+        linear_regression_result_dict[k]["income"] = predict_income_gap_2030
+        linear_regression_result_dict[k]["gender"] = predict_gender_gap_2030
+
+    return linear_regression_result_dict
+
+
+# Calculate the average income & gender gap of 2030
+def calculate_average_gap(result_dict, country_list):
+    average_result_dict = {}
+    sum_income_gap = 0
+    sum_gender_gap = 0
+
+    for k in result_dict:
+        sum_income_gap = sum_income_gap + result_dict[k]["income"]
+        sum_gender_gap = sum_gender_gap + result_dict[k]["gender"]
+
+    average_income_gap = sum_income_gap / len(country_list)
+    average_gender_gap = sum_gender_gap / len(country_list)
+
+    average_result_dict["average_income_gap"] = average_income_gap
+    average_result_dict["average_gender_gap"] = average_gender_gap
+
+    return average_result_dict
+
+
+# Compare the average value with our linear regression results
+# print the lists of countries that are higher than, lower than, or equal to the average prediction
+def compare_with_the_average(average_dict, result_dict):
+    compare_result_dict = {}
+    higher_than_income_average = []
+    lower_than_income_average = []
+    equal_to_income_average = []
+    higher_than_gender_average = []
+    lower_than_gender_average = []
+    equal_to_gender_average = []
+
+    for k in result_dict:
+        if result_dict[k]["income"] > average_dict["average_income_gap"]:
+            higher_than_income_average.append(k)
+        elif result_dict[k]["income"] < average_dict["average_income_gap"]:
+            lower_than_income_average.append(k)
+        elif result_dict[k]["income"] == average_dict["average_income_gap"]:
+            equal_to_income_average.append(k)
+
+        if result_dict[k]["gender"] > average_dict["average_gender_gap"]:
+            higher_than_gender_average.append(k)
+        elif result_dict[k]["gender"] < average_dict["average_gender_gap"]:
+            lower_than_gender_average.append(k)
+        elif result_dict[k]["gender"] == average_dict["average_gender_gap"]:
+            equal_to_gender_average.append(k)
+
+    compare_result_dict["higher_than_income_average"] = higher_than_income_average
+    compare_result_dict["lower_than_income_average"] = lower_than_income_average
+    compare_result_dict["equal_to_income_average"] = equal_to_income_average
+
+    compare_result_dict["higher_than_gender_average"] = higher_than_gender_average
+    compare_result_dict["lower_than_gender_average"] = lower_than_gender_average
+    compare_result_dict["equal_to_gender_average"] = equal_to_gender_average
+
+    return compare_result_dict
+
+
+def main():
+    # Work out the linear regression result for the 'countries' list
+    y_dict = convert_to_target_data_dict(countries)
+    linear_regression_result_dict = total_linear_regression_result(y_dict)
+
+    # Work out the average income & gender gap
+    average_gap_result = calculate_average_gap(linear_regression_result_dict, countries)
+
+    # Compare the average gap with the gap for each country
+    compare_with_average = compare_with_the_average(average_gap_result, linear_regression_result_dict)
+
+    # Print the results
+    print(linear_regression_result_dict)
+    print()
+    print(average_gap_result)
+    print()
+    print(compare_with_average)
+    return linear_regression_result_dict,average_gap_result,compare_with_average
+
+
+if __name__ == "__main__":
+    linear_regression_result_dict,average_gap_result,compare_with_average = main()
+
+
+# overview of our linear regression results
+print()
+print(linear_regression_result_dict)
+
+
+# =============================================================================
+# #Part 5: plot the figures with our predictions for comparison
+# =============================================================================
+
+# Commented out IPython magic to ensure Python compatibility.
+# =============================================================================
+# #plot 1 for income gap with prediction in 2030
+# =============================================================================
+# %matplotlib inline
+from matplotlib.pyplot import MultipleLocator
+plt.figure(figsize=(12,6),dpi=60)
+plt.title('Prediction of Income Gap in 2030')
+plt.xlabel('Year')
+plt.ylabel('Income gap%')
+all_data_i = []
+
+xr_x = list(range(2008,2019))
+xr_x.append(2030)
+# xr_x = list(map(lambda x:str(x),xr_x))
+for c in countries:
+  gap_i = gap_income_Dataframe(c)
+  x_i = list(gap_i.keys())
+  y_i = list(gap_i.values())
+  tmp = linear_regression_result_dict[c]
+  x_i.append(2019)
+  y_i.append(tmp["income"])
+  gap_i[2019] = tmp["income"]
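+  # Note: the 2030 prediction is stored under the placeholder key 2019 so that the
+  # averaging loop below (2008-2018 plus 2019) can include it; the point itself is
+  # plotted at x=2030 via xr_x.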
+  all_data_i.append(gap_i)
+  plt.scatter(xr_x,y_i,marker='+',label=c)
+  plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
+
+x_major_locator=MultipleLocator(1)  #set the x interval as 1
+y_major_locator=MultipleLocator(2)   #set the y interval as 2
+ax=plt.gca()
+ax.xaxis.set_major_locator(x_major_locator)     #Set the major scale of the x-axis to a multiple of 1
+ax.yaxis.set_major_locator(y_major_locator)     #Set the major scale of the y-axis to a multiple of 2
+plt.xlim(2007,2031)   #Set the scale range of the x-axis from 2008 to 2030
+plt.ylim(25,60)     #Set the scale range of the y-axis from 25 to 60
+
+
+xr_i = list(range(2008,2019))
+xr_i.append(2019)
+yr_i = []
+for i in xr_i:
+  temp = 0
+  for j in all_data_i:
+    temp += j[i]
+  temp /= len(countries)
+  yr_i.append(temp)
+
+plt.plot(xr_x,yr_i,"r-",label='average')
+plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
+plt.savefig('Income gap.pdf')  
+plt.show()
+
+
+
+
+# =============================================================================
+# #plot 2 for gender gap with prediction in 2030
+# =============================================================================
+plt.figure(figsize=(12,6),dpi=60)
+plt.title('Prediction of Gender Employment Gap in 2030')
+plt.xlabel('Year')
+plt.ylabel('Gender Employment Gap %')
+all_data_j = []
+
+xr_x = list(range(2008,2019))
+xr_x.append(2030)
+for c in countries:
+  gap_j = gap_gender_Dataframe(c)
+  x_j = list(gap_j.keys())
+  y_j = list(gap_j.values())
+  tmp = linear_regression_result_dict[c]
+  x_j.append(2019)
+  y_j.append(tmp["gender"])
+  gap_j[2019] = tmp["gender"]
+  all_data_j.append(gap_j)
+  plt.scatter(xr_x,y_j,marker='+',label=c)
+  plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
+
+x_major_locator=MultipleLocator(1)  #set the x interval as 1
+y_major_locator=MultipleLocator(2)   #set the y interval as 2
+ax=plt.gca()
+ax.xaxis.set_major_locator(x_major_locator)     #Set the major scale of the x-axis to a multiple of 1
+ax.yaxis.set_major_locator(y_major_locator)     #Set the major scale of the y-axis to a multiple of 2
+plt.xlim(2007,2031)   #Set the scale range of the x-axis from 2008 to 2030
+plt.ylim(2,38)     #Set the scale range of the y-axis from 2 to 38
+
+
+xr_j = list(range(2008,2019))
+xr_j.append(2019)
+yr_j = []
+for i in xr_j:
+  temp = 0
+  for j in all_data_j:
+    temp += j[i]
+  temp /= len(countries)
+  yr_j.append(temp)
+plt.plot(xr_x,yr_j,"r-",label='average')
+plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
+plt.show()
+
+

+ 338 - 0
data/purposeCombined/BI/macro_analysis-backup.py

@@ -0,0 +1,338 @@
+import pandas as pd
+from bokeh.plotting import figure, save, show,output_file, ColumnDataSource
+from bokeh.models import HoverTool
+import matplotlib.pyplot as plt
+
+class DataFrameAnalysis:
+    """Adds macro-analysis capabilities to a dataframe."""
+    def __init__(self, frame):
+        self.df = frame  # dataframe object
+
+    def avg_discount_rate(self):
+        """Calculates average discount rate of all orders."""
+        # You should calculate the average and gross discount rate.
+        self.df['Discount_Amount'] = pd.to_numeric(self.df['Discount_Amount'])
+        self.df['Order_Total_Amount'] = pd.to_numeric(self.df['Order_Total_Amount'])
+        total_sales_amount = self.df['Order_Total_Amount'].sum()
+        total_discount_amount = self.df['Discount_Amount'].sum()
+        total_discount_avg = int((total_discount_amount / (total_discount_amount+total_sales_amount))*100)
+        return print(f'Customer Discount Avg: {total_discount_avg}%')
+
+
+    def customer_role_breakdown(self):
+        """Calculates proportion of retail/wholesale as a function of sales."""
+        retail = 0
+        wholesale = 0
+        sum_count =int(len(self.df.index))
+        sum_sales = self.df['Order_Total_Amount'].sum()
+        retail_customer_count = round((len(self.df.loc[self.df['Customer_Role']=='Customer'].index)/sum_count)*100)
+        wholesale_customer_count = round((len(self.df.loc[self.df['Customer_Role']=='Wholesale Customer'].index)/sum_count)*100)
+        retail_sales = round((self.df['Order_Total_Amount'].loc[self.df['Customer_Role']=='Customer'].sum()/sum_sales)*100)
+        wholesale_sales = round((self.df['Order_Total_Amount'].loc[self.df['Customer_Role']=='Wholesale Customer'].sum()/sum_sales)*100)
+        grid = [[retail_customer_count,wholesale_customer_count],[retail_sales,wholesale_sales]]
+        crb_df = pd.DataFrame(data=grid, columns=['Retail','Wholesale'], index=['Proportional Order Counts', 'Proportional Sales'])
+        plt.style.use('seaborn-deep')
+        fig, ax = plt.subplots(figsize=(10, 10))
+        crb_df.plot.bar(title='Customer Role Breakdown', xlabel='Customer Role', ylabel='Proportion (%)',
+                        cmap='winter', ax=ax)
+        plt.savefig('Customer_Role_Breakdown.png')
+        print(crb_df.head(3))
+
+    def geographical_breakdown(self):
+        """ Displays a scatterplot of Sales/Revenue weights for different States."""
+        self.df = self.df[self.df.Country_Name_Shipping== 'United States (US)']
+        counts = self.df["State_Name_Shipping"].value_counts().to_dict()
+        States = list(counts.keys())
+        Count = list(counts.values())
+        geo = pd.DataFrame({'States': States, 'Counts': Count})
+        geo_dataframe = pd.DataFrame(geo)
+        geo_dataframe.insert(loc=2, column="Sales_Total", value=0)
+        geo_dataframe.insert(loc=3, column="Avg_Purchase_Revenue", value=0)
+        for i, row in self.df.iterrows():
+            state = row.loc['State_Name_Shipping']
+            total = row.loc['Order_Total_Amount']
+            idx = geo_dataframe[geo_dataframe["States"] == state].index.item()
+            av = int(geo_dataframe.at[idx, 'Sales_Total']) / int(geo_dataframe.at[idx, 'Counts'])
+            geo_dataframe.at[idx, 'Sales_Total'] += total
+            geo_dataframe.at[idx, 'Avg_Purchase_Revenue'] = av
+        # data visualization
+        cds = ColumnDataSource(geo_dataframe)
+        cds.data.keys()
+        visual = figure(tools='box_zoom, pan, reset',
+                        width=700, height=700,
+                        title='Geographical Sales Breakdown',
+                        y_axis_label='Order Quantity', x_axis_label='Revenue')
+        visual.circle('Sales_Total', 'Counts', size=7, source=cds, name= 'States')
+        visual.add_tools(HoverTool(tooltips=[("State", "@States"),
+                                             ("Average Purchase Revenue", "@Avg_Purchase_Revenue")
+                                             ]))
+        output_file('geographical_breakdown.html')
+        save(visual)
+        show(visual)
+        return print(geo_dataframe)
+
+
+class ProductAnalysis:
+    """Adds product analysis capabilities to a dataframe."""
+
+    def __init__(self, frame):
+        self.df = frame  # dataframe object
+        self.analysis_frame = self.monthly_product_frame()
+        self.time_span = self.serve_time_span()  # list of tuples: x[0] == year, x[1] == month for x in self.time_span
+
+    def monthly_product_frame(self):
+        """Analyzes the order lines in the CSV_Files folder and
+        Returns a pandas Dataframe with monthly product statistics."""
+        from datetime import datetime
+        import information_repository as ir
+        frame = self.df
+        frame = frame[['Order_Date', 'Product_Name', 'Quantity', 'Item_Cost']]
+        dict_list = []
+        for i, row in frame.iterrows():
+            row_date = row['Order_Date']
+            row_date = datetime.strptime(row_date, "%Y-%m-%d %H:%M")
+            row_date_month = row_date.month
+            row_date_year = row_date.year
+            raw_products = row['Product_Name'].replace('\r', '').split('\n')
+            raw_quantities = row['Quantity'].replace('\r', '').split('\n')
+            raw_cost = row['Item_Cost'].replace('\r', '').split('\n')
+            for key in range(len(raw_products)):
+                product = [i for i in ir.p_list if i in raw_products[key]][0]
+                quantity = int(raw_quantities[key])
+                revenue = float(raw_cost[key])
+                dict_object = [product, quantity, revenue, row_date_month, row_date_year]
+                matched_dictionary = [i for i in dict_list if
+                                      i['name'] == dict_object[0] and i['month'] == dict_object[3]
+                                      and i['year'] == dict_object[4]]
+                if len(matched_dictionary) == 1:
+                    matched_dictionary[0]['count'] += dict_object[1]
+                    matched_dictionary[0]['revenue'] += dict_object[2]
+                else:
+                    dict_list.append({'name': dict_object[0], 'count': dict_object[1],
+                                      'revenue': dict_object[2], 'month': dict_object[3], 'year': dict_object[4]})
+        self.analysis_frame = pd.DataFrame(columns=['year', 'month', 'count', 'revenue', 'change_over_month', 'product'])
+        time_span = []
+        for product in ir.p_list:
+            product_dictionaries = sorted(
+                sorted([i for i in dict_list if i['name'] == product], key=lambda x: x['month']
+                       ), key=lambda x: x['year'])
+            data_list = []
+            year_list = []
+            month_list = []
+            for key in range(len(product_dictionaries)):
+                if key > 0:
+                    try:
+                        change_over_month = (100 - round(
+                            ((product_dictionaries[key]['revenue'] / product_dictionaries[key]['count'])
+                             / (product_dictionaries[key - 1]['revenue'] / product_dictionaries[key - 1][
+                                        'count'])) * 100))
+
+                    except IndexError:
+                        print('change_list calls need to be refined')
+                else:
+                    change_over_month = 0
+
+                row_list = [product_dictionaries[key]['year'], product_dictionaries[key]['month'],
+                            product_dictionaries[key]['count'], product_dictionaries[key]['revenue'], change_over_month,
+                            product_dictionaries[key]['name']]
+                data_list.append(row_list)
+                if product == 'Blue Moon':
+                    month_list.append(product_dictionaries[key]['month'])
+                    year_list.append(product_dictionaries[key]['year'])
+
+            if product == 'Blue Moon':
+                time_span = [*zip(year_list, month_list)]
+            append_frame = pd.DataFrame(data=data_list,
+                                        columns=['year', 'month', 'count', 'revenue', 'change_over_month', 'product'])
+            self.analysis_frame = pd.concat([self.analysis_frame, append_frame], ignore_index=True)
+        self.time_span = time_span
+        return self.analysis_frame
+
+    def highest_positive_product_change_over_month_analysis(self):
+        """Analyzes the monthly_product_frame and returns the 5 products whose sales level increased the most"""
+        year = int(input('Type the year you would like to query in yyyy format:  '))
+        month = int(input('Type the month you would like to query:  '))
+        data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[self.analysis_frame['year'] == year].loc[self.analysis_frame['revenue']>500]
+        data_slice.sort_values(by='change_over_month', inplace=True, ascending=False)
+        return print(data_slice.head(5))
+
+    def highest_negative_product_change_over_month_analysis(self):
+        """Analyzes the monthly_product_frame and returns the 5 products whose sales level decreased the most"""
+        year = int(input('Type the year you would like to query in yyyy format:  '))
+        month = int(input('Type the month you would like to query:  '))
+        data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[self.analysis_frame['year'] == year].loc[self.analysis_frame['revenue']>500]
+        data_slice.sort_values(by='change_over_month', inplace=True, ascending=True)
+        return data_slice
+
+    def product_line_change_over_month_analysis(self, year, month):
+        """Analyzes the monthly_product_frame by product line and returns a dataframe with
+        product line change over month data."""
+        import information_repository as ir
+        #year = int(input('Type the year you would like to query in yyyy format:  '))
+        #month = int(input('Type the month you would like to query:  '))
+        product_line_list_of_lists = [ir.tea_product_list, ir.capsule_product_list, ir.smokeable_product_list,
+                             ir.skincare_product_list, ir.superfood_product_list, ir.honey_product_list,
+                             ir.tincture_product_list]
+        product_line_strings = ['Tea', 'Capsules', 'Smokeables', 'Skincare', 'Superfood', 'Honey', 'Tinctures']
+        product_line_append_list = []
+        line_index_counter = 0
+        for product_line in product_line_list_of_lists:
+            line_list = []
+            line_list.append(year)
+            line_list.append(month)
+            data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[
+                self.analysis_frame['year'] == year].loc[self.analysis_frame['product'].isin(product_line)]
+            if month > 1:
+                last_month_frame = self.analysis_frame.loc[self.analysis_frame['month'] == (month - 1)].loc[
+                    self.analysis_frame['year'] == year].loc[self.analysis_frame['product'].isin(product_line)]
+            else:
+                last_month_frame = self.analysis_frame.loc[self.analysis_frame['month'] == 12].loc[
+                    self.analysis_frame['year'] == (year - 1)].loc[self.analysis_frame['product'].isin(product_line)]
+            last_month_revenue = last_month_frame['revenue'].sum()
+            this_month_revenue = data_slice['revenue'].sum()
+            avg_change_over_month = (this_month_revenue / last_month_revenue) * 100
+            line_list.append(avg_change_over_month)
+            product_line = product_line_strings[line_index_counter]
+            line_index_counter += 1
+            line_list.append(product_line)
+            product_line_append_list.append(line_list)
+        product_line_analysis_frame = pd.DataFrame(data=product_line_append_list,
+                                                   columns=['year', 'month', 'avg_change_over_month',
+                                                            'product_line'])
+        product_line_analysis_frame.to_csv('product_line_csv_2021.csv')
+        return product_line_analysis_frame
+
+    def serve_time_span(self):
+        """Returns a list of tuples of unique (year, month) pairs in chronological order based on the
+         monthly_product_frame."""
+        return sorted(sorted(list(set([*zip(self.analysis_frame['year'],self.analysis_frame['month'])])),
+                            key=lambda x: x[1]), key=lambda x: x[0])
+
+    def product_line_change_over_month_graph(self):
+        """Using the product_line_change_over_month_analysis frame, it outputs a graph of the changes over time for
+        the top product lines."""
+        line_change_frame_data = []
+        for i in self.time_span:
+            month_frame = self.product_line_change_over_month_analysis(i[0], i[1])
+            change_list = month_frame['avg_change_over_month']
+            line_change_frame_data.append(change_list)
+        treated_line_change_frame_data = []
+        for i in range(len(line_change_frame_data)): #index of time period/segment
+            if i ==0:
+                treated_line_change_frame_data.append([self.time_span[i][0], self.time_span[i][1],
+                                                       0,0,0,0,0,0,0]) #insert base amounts for the first month
+            else: #function as intended
+                month_cumulative_change_list = []
+                month_cumulative_change_list.append(self.time_span[i][0])
+                month_cumulative_change_list.append(self.time_span[i][1])# append year and month
+                for x in range(len(line_change_frame_data[0])):
+                    prior_change_list = [i[x] for i in line_change_frame_data]
+                    product_cumulative_change = (100+treated_line_change_frame_data[i-1][x+2]) * ((prior_change_list[i]/100))-100
+                    #i-1 for previous time period and x+2 for offset due to year and month category
+                    month_cumulative_change_list.append(product_cumulative_change)
+                treated_line_change_frame_data.append(month_cumulative_change_list)
+        graph_frame = pd.DataFrame(data=treated_line_change_frame_data, columns=['Year', 'Month', 'Tea', 'Capsules', 'Smokeables','Skincare',
+                                                                           'Superfood', 'Honey', 'Tinctures'])
+        print(graph_frame.head(7))
+        x = [str(i) for i in graph_frame['Month']]
+        y1 = graph_frame['Tea']
+        y2 = graph_frame['Capsules']
+        y3 = graph_frame['Superfood']
+        y4 = graph_frame['Honey']
+        y5 = graph_frame['Smokeables']
+        graph = figure(x_range=x,title='Cumulative Percentage Change of Product Lines',x_axis_label='Month', y_axis_label='Percentage Change')
+        graph.line(x, y1, legend_label ='Tea', color='red', line_width=3)
+        graph.line(x, y2, legend_label ='Capsules', color='blue', line_width=3)
+        graph.line(x, y3, legend_label ='Superfood', color='orange', line_width=3)
+        graph.line(x, y4, legend_label ='Honey', color='yellow', line_width=3)
+        graph.line(x, y5, legend_label ='Smokeables', color='green', line_width=3)
+        output_file('product_line_change_over_month.html')
+        save(graph)
+        return show(graph)
+
+
+class InventoryPredictor:
+    """Inventory volume prediction using a product sales csv as the raw data."""
+    def __init__(self):
+        import information_repository as ir
+        self.unit_counts = self.sales_unit_count_dictionaries()
+        self.ingredients = self.ingredient_dictionary()
+        self.recipes = ir.unit_recipes
+
+        print('initiating')
+        pass
+
+    def sales_unit_count_dictionaries(self):
+        """Creates a set of dictionaries for each product and the cumulative quantity of units across all SKUs."""
+        import information_repository as ir
+        product_sales_frame = pd.read_csv('product_sales.csv')
+        product_sales_frame = product_sales_frame.where(pd.notnull(product_sales_frame), 'None')
+        product_unit_amounts = []
+        for i in ir.p_list:
+            product_dict = dict(name=i, quantity=0)
+            for x, row in product_sales_frame.iterrows():
+                if i in row['Product Name']:
+                    if i in ir.tea_product_list:
+                        if '1' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold']
+                        elif '3' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 3
+                        elif '20' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 20
+                        else:
+                            pass
+                            # print('Something unexpected occured', row['Product Name'], row['Variation Attributes'])
+                    elif i in ir.superfood_product_list:
+                        if '3' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold']
+                        elif '9' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 3
+                        else:
+                            product_dict['quantity'] += 1
+                    elif i in ir.capsule_product_list:
+                        if '1' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold']
+                        if '4' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 4
+                    elif i in ir.smokeable_product_list:
+                        if '7' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 7
+                        elif 'prerolls' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 2
+                        else:
+                            product_dict['quantity'] += row['Quantity Sold'] * 4
+                    elif i in ir.honey_product_list:
+                        if '3' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 3
+                        elif '5' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 5
+                        elif '2' in row['Variation Attributes']:
+                            pass
+                            # print('Reminder that packet honeys and jars need to separate')
+                    else:
+                        product_dict['quantity'] += row['Quantity Sold']
+            product_unit_amounts.append(product_dict)
+        return product_unit_amounts
+
+    def ingredient_dictionary(self):
+        """Creates an ingredient dictionary with all ingredients as keys and the cumulative volume across all
+        products as values."""
+        inventory = pd.read_csv('craftybase-export-material.csv')
+        ingredient_dictionary = {}
+        for i in list(inventory['name']):
+            ingredient_dictionary[i]=0
+        return ingredient_dictionary
+
+    def ingredient_volume_table(self):
+        """Creates a csv with ingredients and the cumulative volume used across a time span."""
+        for x in self.unit_counts:
+            for y in self.recipes:
+                if x['name'] == y['name']:
+                    for k, v in y.items():
+                        if k != 'name':
+                            self.ingredients[k] += v * x['quantity']
+        sorted_ingredient_volumes = sorted(self.ingredients.items(), key=lambda x: x[1], reverse=True)
+        output_frame = pd.DataFrame(data = sorted_ingredient_volumes, columns= ['Ingredient', 'Volume (gram or oz)'])
+        output_frame = output_frame[output_frame['Volume (gram or oz)'] !=0]
+        output_frame.to_csv('ingredient_volume_table.csv')
+

+ 338 - 0
data/purposeCombined/BI/macro_analysis.py

@@ -0,0 +1,338 @@
+import pandas as pd
+from bokeh.plotting import figure, save, show,output_file, ColumnDataSource
+from bokeh.models import HoverTool
+import matplotlib.pyplot as plt
+
+class DataFrameAnalysis:
+    """Adds macro-analysis capabilities to a dataframe."""
+    def __init__(self, frame):
+        self.df = frame  # dataframe object
+
+    def avg_discount_rate(self):
+        """Calculates average discount rate of all orders."""
+        # You should calculate the average and gross discount rate.
+        self.df['Discount_Amount'] = pd.to_numeric(self.df['Discount_Amount'])
+        self.df['Order_Total_Amount'] = pd.to_numeric(self.df['Order_Total_Amount'])
+        total_sales_amount = self.df['Order_Total_Amount'].sum()
+        total_discount_amount = self.df['Discount_Amount'].sum()
+        total_discount_avg = int((total_discount_amount / (total_discount_amount+total_sales_amount))*100)
+        return print(f'Customer Discount Avg: {total_discount_avg}%')
+
+
+    def customer_role_breakdown(self):
+        """Calculates proportion of retail/wholesale as a function of sales."""
+        retail = 0
+        wholesale = 0
+        sum_count =int(len(self.df.index))
+        sum_sales = self.df['Order_Total_Amount'].sum()
+        retail_customer_count = round((len(self.df.loc[self.df['Customer_Role']=='Customer'].index)/sum_count)*100)
+        wholesale_customer_count = round((len(self.df.loc[self.df['Customer_Role']=='Wholesale Customer'].index)/sum_count)*100)
+        retail_sales = round((self.df['Order_Total_Amount'].loc[self.df['Customer_Role']=='Customer'].sum()/sum_sales)*100)
+        wholesale_sales = round((self.df['Order_Total_Amount'].loc[self.df['Customer_Role']=='Wholesale Customer'].sum()/sum_sales)*100)
+        grid = [[retail_customer_count,wholesale_customer_count],[retail_sales,wholesale_sales]]
+        crb_df = pd.DataFrame(data=grid, columns=['Retail','Wholesale'], index=['Proportional Order Counts', 'Proportional Sales'])
+        plt.style.use('seaborn-deep')
+        fig, ax = plt.subplots(figsize=(10, 10))
+        crb_df.plot.bar(title='Customer Role Breakdown', xlabel='Customer Role', ylabel='Proportion (%)',
+                        cmap='winter', ax=ax)
+        plt.savefig('Customer_Role_Breakdown.png')
+        print(crb_df.head(3))
+
+    def geographical_breakdown(self):
+        """ Displays a scatterplot of Sales/Revenue weights for different States."""
+        self.df = self.df[self.df.Country_Name_Shipping== 'United States (US)']
+        counts = self.df["State_Name_Shipping"].value_counts().to_dict()
+        States = list(counts.keys())
+        Count = list(counts.values())
+        geo = pd.DataFrame({'States': States, 'Counts': Count})
+        geo_dataframe = pd.DataFrame(geo)
+        geo_dataframe.insert(loc=2, column="Sales_Total", value=0)
+        geo_dataframe.insert(loc=3, column="Avg_Purchase_Revenue", value=0)
+        for i, row in self.df.iterrows():
+            state = row.loc['State_Name_Shipping']
+            total = row.loc['Order_Total_Amount']
+            idx = geo_dataframe[geo_dataframe["States"] == state].index.item()
+            av = int(geo_dataframe.at[idx, 'Sales_Total']) / int(geo_dataframe.at[idx, 'Counts'])
+            geo_dataframe.at[idx, 'Sales_Total'] += total
+            geo_dataframe.at[idx, 'Avg_Purchase_Revenue'] = av
+        # data visualization
+        cds = ColumnDataSource(geo_dataframe)
+        cds.data.keys()
+        visual = figure(tools='box_zoom, pan, reset',
+                        width=700, height=700,
+                        title='Geographical Sales Breakdown',
+                        y_axis_label='Order Quantity', x_axis_label='Revenue')
+        visual.circle('Sales_Total', 'Counts', size=7, source=cds, name= 'States')
+        visual.add_tools(HoverTool(tooltips=[("State", "@States"),
+                                             ("Average Purchase Revenue", "@Avg_Purchase_Revenue")
+                                             ]))
+        output_file('geographical_breakdown.html')
+        save(visual)
+        show(visual)
+        return print(geo_dataframe)
+
+
+class ProductAnalysis:
+    """Adds product analysis capabilities to a dataframe."""
+
+    def __init__(self, frame):
+        self.df = frame  # dataframe object
+        self.analysis_frame = self.monthly_product_frame()
+        self.time_span = self.serve_time_span()  # list of tuples: x[0] == year, x[1] == month for x in self.time_span
+
+    def monthly_product_frame(self):
+        """Analyzes the order lines in the CSV_Files folder and
+        Returns a pandas Dataframe with monthly product statistics."""
+        from datetime import datetime
+        import information_repository as ir
+        frame = self.df
+        frame = frame[['Order_Date', 'Product_Name', 'Quantity', 'Item_Cost']]
+        dict_list = []
+        for i, row in frame.iterrows():
+            row_date = row['Order_Date']
+            row_date = datetime.strptime(row_date, "%Y-%m-%d %H:%M")
+            row_date_month = row_date.month
+            row_date_year = row_date.year
+            raw_products = row['Product_Name'].replace('\r', '').split('\n')
+            raw_quantities = row['Quantity'].replace('\r', '').split('\n')
+            raw_cost = row['Item_Cost'].replace('\r', '').split('\n')
+            for key in range(len(raw_products)):
+                product = [i for i in ir.p_list if i in raw_products[key]][0]
+                quantity = int(raw_quantities[key])
+                revenue = float(raw_cost[key])
+                dict_object = [product, quantity, revenue, row_date_month, row_date_year]
+                matched_dictionary = [i for i in dict_list if
+                                      i['name'] == dict_object[0] and i['month'] == dict_object[3]
+                                      and i['year'] == dict_object[4]]
+                if len(matched_dictionary) == 1:
+                    matched_dictionary[0]['count'] += dict_object[1]
+                    matched_dictionary[0]['revenue'] += dict_object[2]
+                else:
+                    dict_list.append({'name': dict_object[0], 'count': dict_object[1],
+                                      'revenue': dict_object[2], 'month': dict_object[3], 'year': dict_object[4]})
+        self.analysis_frame = pd.DataFrame(columns=['year', 'month', 'count', 'revenue', 'change_over_month', 'product'])
+        time_span = []
+        for product in ir.p_list:
+            product_dictionaries = sorted(
+                sorted([i for i in dict_list if i['name'] == product], key=lambda x: x['month']
+                       ), key=lambda x: x['year'])
+            data_list = []
+            year_list = []
+            month_list = []
+            for key in range(len(product_dictionaries)):
+                if key > 0:
+                    try:
+                        change_over_month = (100 - round(
+                            ((product_dictionaries[key]['revenue'] / product_dictionaries[key]['count'])
+                             / (product_dictionaries[key - 1]['revenue'] / product_dictionaries[key - 1][
+                                        'count'])) * 100))
+
+                    except IndexError:
+                        print('change_list calls need to be refined')
+                else:
+                    change_over_month = 0
+
+                row_list = [product_dictionaries[key]['year'], product_dictionaries[key]['month'],
+                            product_dictionaries[key]['count'], product_dictionaries[key]['revenue'], change_over_month,
+                            product_dictionaries[key]['name']]
+                data_list.append(row_list)
+                if product == 'Blue Moon':
+                    month_list.append(product_dictionaries[key]['month'])
+                    year_list.append(product_dictionaries[key]['year'])
+
+            if product == 'Blue Moon':
+                time_span = [*zip(year_list, month_list)]
+            append_frame = pd.DataFrame(data=data_list,
+                                        columns=['year', 'month', 'count', 'revenue', 'change_over_month', 'product'])
+            self.analysis_frame = pd.concat([self.analysis_frame, append_frame], ignore_index=True)
+        self.time_span = time_span
+        return self.analysis_frame
+
+    def highest_positive_product_change_over_month_analysis(self):
+        """Analyzes the monthly_product_frame and returns the 5 products whose sales level increased the most"""
+        year = int(input('Type the year you would like to query in yyyy format:  '))
+        month = int(input('Type the month you would like to query:  '))
+        data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[self.analysis_frame['year'] == year].loc[self.analysis_frame['revenue']>500]
+        data_slice.sort_values(by='change_over_month', inplace=True, ascending=False)
+        return print(data_slice.head(5))
+
+    def highest_negative_product_change_over_month_analysis(self):
+        """Analyzes the monthly_product_frame and returns the 5 products whose sales level decreased the most"""
+        year = int(input('Type the year you would like to query in yyyy format:  '))
+        month = int(input('Type the month you would like to query:  '))
+        data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[self.analysis_frame['year'] == year].loc[self.analysis_frame['revenue']>500]
+        data_slice.sort_values(by='change_over_month', inplace=True, ascending=True)
+        return data_slice
+
+    def product_line_change_over_month_analysis(self, year, month):
+        """Analyzes the monthly_product_frame by product line and returns a dataframe with
+        product line change over month data."""
+        import information_repository as ir
+        #year = int(input('Type the year you would like to query in yyyy format:  '))
+        #month = int(input('Type the month you would like to query:  '))
+        product_line_list_of_lists = [ir.tea_product_list, ir.capsule_product_list, ir.smokeable_product_list,
+                             ir.skincare_product_list, ir.superfood_product_list, ir.honey_product_list,
+                             ir.tincture_product_list]
+        product_line_strings = ['Tea', 'Capsules', 'Smokeables', 'Skincare', 'Superfood', 'Honey', 'Tinctures']
+        product_line_append_list = []
+        line_index_counter = 0
+        for product_line in product_line_list_of_lists:
+            line_list = []
+            line_list.append(year)
+            line_list.append(month)
+            data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[
+                self.analysis_frame['year'] == year].loc[self.analysis_frame['product'].isin(product_line)]
+            if month > 1:
+                last_month_frame = self.analysis_frame.loc[self.analysis_frame['month'] == (month - 1)].loc[
+                    self.analysis_frame['year'] == year].loc[self.analysis_frame['product'].isin(product_line)]
+            else:
+                last_month_frame = self.analysis_frame.loc[self.analysis_frame['month'] == 12].loc[
+                    self.analysis_frame['year'] == (year - 1)].loc[self.analysis_frame['product'].isin(product_line)]
+            last_month_revenue = last_month_frame['revenue'].sum()
+            this_month_revenue = data_slice['revenue'].sum()
+            avg_change_over_month = (this_month_revenue / last_month_revenue) * 100
+            line_list.append(avg_change_over_month)
+            product_line = product_line_strings[line_index_counter]
+            line_index_counter += 1
+            line_list.append(product_line)
+            product_line_append_list.append(line_list)
+        product_line_analysis_frame = pd.DataFrame(data=product_line_append_list,
+                                                   columns=['year', 'month', 'avg_change_over_month',
+                                                            'product_line'])
+        product_line_analysis_frame.to_csv('product_line_csv_2021.csv')
+        return product_line_analysis_frame
+
+    def serve_time_span(self):
+        """Returns a list of tuples of unique (year, month) pairs in chronological order based on the
+         monthly_product_frame."""
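+        # e.g. a frame covering Nov 2020 - Feb 2021 yields
+        # [(2020, 11), (2020, 12), (2021, 1), (2021, 2)]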
+        return sorted(sorted(list(set([*zip(self.analysis_frame['year'],self.analysis_frame['month'])])),
+                            key=lambda x:x[1]), key=lambda x:x[0])
+
+    def product_line_change_over_month_graph(self):
+        """Using the product_line_change_over_month_analysis frame, it outputs a graph of the changes over time for
+        the top product lines."""
+        line_change_frame_data = []
+        for i in self.time_span:
+            month_frame = self.product_line_change_over_month_analysis(i[0], i[1])
+            change_list = month_frame['avg_change_over_month']
+            line_change_frame_data.append(change_list)
+        treated_line_change_frame_data = []
+        for i in range(len(line_change_frame_data)): #index of time period/segment
+            if i ==0:
+                treated_line_change_frame_data.append([self.time_span[i][0], self.time_span[i][1],
+                                                       0,0,0,0,0,0,0]) #insert base amounts for the first month
+            else:  # compound this month's ratio on top of the running cumulative change
+                month_cumulative_change_list = []
+                month_cumulative_change_list.append(self.time_span[i][0])
+                month_cumulative_change_list.append(self.time_span[i][1])# append year and month
+                for x in range(len(line_change_frame_data[0])):
+                    prior_change_list = [i[x] for i in line_change_frame_data]
+                    product_cumulative_change = (100+treated_line_change_frame_data[i-1][x+2]) * ((prior_change_list[i]/100))-100
+                    #i-1 for previous time period and x+2 for offset due to year and month category
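+                    # worked example: a line sitting at +10% cumulative change whose revenue is 120%
+                    # of the previous month's moves to (100 + 10) * (120 / 100) - 100 = 32%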
+                    month_cumulative_change_list.append(product_cumulative_change)
+                treated_line_change_frame_data.append(month_cumulative_change_list)
+        graph_frame = pd.DataFrame(data=treated_line_change_frame_data, columns=['Year', 'Month', 'Tea', 'Capsules', 'Smokeables','Skincare',
+                                                                           'Superfood', 'Honey', 'Tinctures'])
+        print(graph_frame.head(7))
+        x = [str(i) for i in graph_frame['Month']]
+        y1 = graph_frame['Tea']
+        y2 = graph_frame['Capsules']
+        y3 = graph_frame['Superfood']
+        y4 = graph_frame['Honey']
+        y5 = graph_frame['Smokeables']
+        graph = figure(x_range=x,title='Cumulative Percentage Change of Product Lines',x_axis_label='Month', y_axis_label='Percentage Change')
+        graph.line(x, y1, legend_label ='Tea', color='red', line_width=3)
+        graph.line(x, y2, legend_label ='Capsules', color='blue', line_width=3)
+        graph.line(x, y3, legend_label ='Superfood', color='orange', line_width=3)
+        graph.line(x, y4, legend_label ='Honey', color='yellow', line_width=3)
+        graph.line(x, y5, legend_label ='Smokeables', color='green', line_width=3)
+        output_file('product_line_change_over_month.html')
+        save(graph)
+        return show(graph)
+
+
+class InventoryPredictor:
+    """Inventory volume prediction using a product sales csv as the raw data."""
+    def __init__(self):
+        import information_repository as ir
+        self.unit_counts = self.sales_unit_count_dictionaries()
+        self.ingredients = self.ingredient_dictionary()
+        self.recipes = ir.unit_recipes
+
+        print('initiating')
+        pass
+
+    def sales_unit_count_dictionaries(self):
+        """Creates a set of dictionaries for each product and the cumulative quantity of units across all SKUs."""
+        import information_repository as ir
+        product_sales_frame = pd.read_csv('product_sales.csv')
+        product_sales_frame = product_sales_frame.where(pd.notnull(product_sales_frame), 'None')
+        product_unit_amounts = []
+        for i in ir.p_list:
+            product_dict = dict(name=i, quantity=0)
+            for x, row in product_sales_frame.iterrows():
+                if i in row['Product Name']:
+                    if i in ir.tea_product_list:
+                        if '1' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold']
+                        elif '3' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 3
+                        elif '20' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 20
+                        else:
+                            pass
+                            # print('Something unexpected occurred', row['Product Name'], row['Variation Attributes'])
+                    elif i in ir.superfood_product_list:
+                        if '3' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold']
+                        elif '9' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 3
+                        else:
+                            product_dict['quantity'] += 1
+                    elif i in ir.capsule_product_list:
+                        if '1' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold']
+                        if '4' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 4
+                    elif i in ir.smokeable_product_list:
+                        if '7' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 7
+                        elif 'prerolls' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 2
+                        else:
+                            product_dict['quantity'] += row['Quantity Sold'] * 4
+                    elif i in ir.honey_product_list:
+                        if '3' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 3
+                        elif '5' in row['Variation Attributes']:
+                            product_dict['quantity'] += row['Quantity Sold'] * 5
+                        elif '2' in row['Variation Attributes']:
+                            pass
+                            # print('Reminder that packet honeys and jars need to be separated')
+                    else:
+                        product_dict['quantity'] += row['Quantity Sold']
+            product_unit_amounts.append(product_dict)
+        return product_unit_amounts
+
+    def ingredient_dictionary(self):
+        """Creates a ingredient dictionary with all ingredients as keys and the cumulative volume across all
+        products as values."""
+        inventory = pd.read_csv('craftybase-export-material.csv')
+        ingredient_dictionary = {}
+        for i in list(inventory['name']):
+            ingredient_dictionary[i]=0
+        return ingredient_dictionary
+
+    def ingredient_volume_table(self):
+        """Creates a csv with ingredients and the cumulative volume used across a time span."""
+        for x in self.unit_counts:
+            for y in self.recipes:
+                if x['name'] == y['name']:
+                    for k, v in y.items():
+                        if k != 'name':
+                            self.ingredients[k] += v * x['quantity']
+        sorted_ingredient_volumes = sorted(self.ingredients.items(), key=lambda x: x[1], reverse=True)
+        output_frame = pd.DataFrame(data = sorted_ingredient_volumes, columns= ['Ingredient', 'Volume (gram or oz)'])
+        output_frame = output_frame[output_frame['Volume (gram or oz)'] !=0]
+        output_frame.to_csv('ingredient_volume_table.csv')
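+
+# Example usage (a sketch; assumes product_sales.csv and craftybase-export-material.csv
+# are present in the working directory):
+#     predictor = InventoryPredictor()
+#     predictor.ingredient_volume_table()  # writes ingredient_volume_table.csv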
+

+ 662 - 0
data/purposeCombined/BI/practica3.py

@@ -0,0 +1,662 @@
+# -*- coding: utf-8 -*-
+"""
+Autor:
+    Francisco Solano López Rodríguez
+Fecha:
+    Noviembre/2018
+Contenido:
+    Práctica 3
+    Inteligencia de Negocio
+    Grado en Ingeniería Informática
+    Universidad de Granada
+"""
+
+''' -------------------- IMPORT LIBRARY -------------------- '''
+
+import pandas as pd
+import numpy as np
+import time
+import matplotlib.pyplot as plt
+import seaborn as sns
+from collections import Counter
+
+import datetime
+
+from sklearn.model_selection import StratifiedKFold, KFold
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.feature_selection import VarianceThreshold
+from sklearn import ensemble
+
+''' --- classifiers import --- '''
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import ExtraTreesClassifier
+from sklearn.neural_network import MLPClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn import svm
+import xgboost as xgb
+import lightgbm as lgb
+from sklearn import tree
+
+from sklearn.svm import SVC, LinearSVC, NuSVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+
+from catboost import Pool, CatBoostClassifier
+
+''' --- preprocessing import --- '''
+from sklearn import preprocessing
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing import LabelEncoder
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import StandardScaler  
+from sklearn.preprocessing import PolynomialFeatures
+from sklearn.preprocessing import scale
+from sklearn.preprocessing import Normalizer
+
+''' --- metrics import --- '''
+from sklearn import metrics
+from sklearn.metrics import roc_curve, auc
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import accuracy_score
+
+from math import sin, cos, sqrt, atan2, radians
+
+
+# Extract features from the recording date and compute the age of the well
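+# e.g. a row with date_recorded '2013-02-04' and construction_year 1996 gets
+# year_recorder=2013, month_recorder=2, weekday_recorder=1 (Monday) and age=17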
+def date_parser(df):
+    date_recorder = list(map(lambda x: datetime.datetime.strptime(str(x), '%Y-%m-%d'),
+                             df['date_recorded'].values))
+    df['year_recorder'] = list(map(lambda x: int(x.strftime('%Y')), date_recorder))
+    df['weekday_recorder'] = list(map(lambda x: int(x.strftime('%w')), date_recorder))
+    df['yearly_week_recorder'] = list(map(lambda x: int(x.strftime('%W')), date_recorder))
+    df['month_recorder'] = list(map(lambda x: int(x.strftime('%m')), date_recorder))
+    df['age'] = df['year_recorder'].values - df['construction_year'].values
+    del df['date_recorded']
+    return df
+
+
+# Get the great-circle distance (km) to the coordinate (lon2, lat2)
+def distancia(lon1, lat1, lon2, lat2):
+    # Haversine formula; the inputs are in degrees, so convert them to radians first
+    lon1, lat1, lon2, lat2 = map(radians, (lon1, lat1, lon2, lat2))
+    dlon = lon2 - lon1
+    dlat = lat2 - lat1
+
+    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
+    c = 2 * atan2(sqrt(a), sqrt(1 - a))
+    R = 6371
+
+    return R * c
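+# e.g. distancia(35.0, -6.0, 0, 0) gives the great-circle distance in km from a
+# point in Tanzania to the (0, 0) reference used as a feature below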
+
+# Get the Cartesian x coordinate from the longitude and latitude
+def cartesian_x(lon, lat):
+    lat=radians(lat)
+    lon=radians(lon)
+    R=6371.0
+    x = R * cos(lat) * cos(lon)
+    return x
+
+# Get the Cartesian y coordinate from the longitude and latitude
+def cartesian_y(lon, lat):
+    lat=radians(lat)
+    lon=radians(lon)
+    R=6371.0
+    y = R * cos(lat) * sin(lon)
+    return y
+
+# Confusion matrix
+def plot_confusion_matrix(y_test, predictions):
+    cm = metrics.confusion_matrix(y_test, predictions)
+    plt.figure(figsize=(9,9))
+    sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True)
+    plt.ylabel('Actual label')
+    plt.xlabel('Predicted label')
+    plt.show()
+
+# Function to perform the cross-validation with optional preprocessing
+def cross_validation(clf, X, y, cv = None, min_max_scaler = False, scaled = False, standard_scaler = False, normalizer = False, poly = False, m_confusion = False):
+
+    if cv is None:
+        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123456)
+
+    iteration = 0
+
+    for train, test in cv.split(X, y):
+
+        X_train, X_test = X[train], X[test]
+        y_train, y_test = y[train], y[test]
+
+
+        if min_max_scaler:
+            X_train = MinMaxScaler().fit_transform(X_train)
+            X_test = MinMaxScaler().fit_transform(X_test)
+
+        if scaled:
+            X_train = scale(X_train)
+            X_test = scale(X_test)
+
+        if poly:
+            X_train = PolynomialFeatures(degree = 2, interaction_only=True).fit_transform(X_train)
+            X_test = PolynomialFeatures(degree = 2, interaction_only=True).fit_transform(X_test)
+
+        if standard_scaler:
+            transformer = StandardScaler().fit(X_train)
+            X_train = transformer.transform(X_train)
+            X_test = transformer.transform(X_test)
+
+        if normalizer:
+            transformer = Normalizer().fit(X_train)
+            X_train = transformer.transform(X_train)
+            X_test = transformer.transform(X_test)
+
+        t = time.time()
+        clf = clf.fit(X_train,y_train)
+        training_time = time.time() - t
+
+        predictions_train = clf.predict(X_train)
+        predictions = clf.predict(X_test)
+
+        print("--------- Iteración ", iteration, " --------- ")
+        print("Tiempo :: ", training_time)
+        print ("Train Accuracy :: ", accuracy_score(y_train, predictions_train))
+        print ("Test Accuracy  :: ", accuracy_score(y_test, predictions))
+        print("")
+
+        if m_confusion:
+            plot_confusion_matrix(y_test, predictions)
+
+        iteration += 1
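+
+# Example (sketch): 5-fold stratified CV of a random forest with standardised features
+#     clf = RandomForestClassifier(n_estimators=125, max_depth=20)
+#     cross_validation(clf, X, y, standard_scaler=True)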
+
+''' ------------------------------------------------------------------ '''
+''' --------------------------- READ DATA ---------------------------- '''
+''' ------------------------------------------------------------------ '''
+
+print("\nWATER PUMP COMPETITION\n")
+
+print("Leyendo datos...")
+
+# the .csv files were prepared beforehand to replace ,, and "Not known" with NaN (missing values)
+data_x_orig = pd.read_csv('data/water_pump_tra.csv')
+data_y = pd.read_csv('data/water_pump_tra_target.csv')
+data_x_tst = pd.read_csv('data/water_pump_tst.csv')
+
+print(data_x_orig.shape)
+print(data_x_tst.shape)
+
+print("Lectura completada.\n")
+
+
+''' ------------------------------------------------------------------ '''
+''' -------------------------- LOOK AT DATA -------------------------- '''
+''' ------------------------------------------------------------------ '''
+
+print('Looking at the data:\n')
+
+data_x = data_x_orig
+
+print('num_private:')
+print(data_x['num_private'].value_counts()[0:3])
+print('recorded_by:')
+print(data_x['recorded_by'].value_counts())
+print(data_y.status_group.value_counts()/len(data_y))
+
+data_y.status_group.value_counts().plot(kind='bar')
+plt.xticks(rotation = 0)
+plt.show()
+
+print('Examples with longitude = 0')
+print(len(data_x.loc[data_x['longitude']==0,'longitude']))
+
+print('Examples with latitude = 0')
+print(len(data_x.loc[data_x['latitude']==-0.00000002,'latitude']))
+
+print('Examples with construction_year = 0')
+print(len(data_x.loc[data_x['construction_year']==0,'construction_year']))
+
+
+corr = data_x.corr()
+sns.heatmap(corr)
+plt.xticks(rotation=45)
+plt.show()
+
+print("Valores perdidos:")
+print(data_x.isnull().sum())
+
+data_x.isnull().sum().plot.bar()
+plt.show()
+
+print('funder:\n')
+print(data_x['funder'].value_counts()[0:6])
+print('\ninstaller:\n')
+print(data_x['installer'].value_counts()[0:6])
+print('\npublic_meeting:\n')
+print(data_x['public_meeting'].value_counts()[0:6])
+print('\nscheme_management:\n')
+print(data_x['scheme_management'].value_counts()[0:6])
+print('\npermit:\n')
+print(data_x['permit'].value_counts()[0:6])
+print('\nsubvillage:\n')
+print(data_x['subvillage'].value_counts()[0:6])
+print('\nwpt_name:\n')
+print(data_x['wpt_name'].value_counts()[0:6])
+
+'''
+data_x['funder'].value_counts()[0:10].plot.bar()
+plt.show()
+data_x['installer'].value_counts().plot.bar()
+plt.show()
+data_x['public_meeting'].value_counts().plot.bar()
+plt.show()
+data_x['scheme_management'].value_counts().plot.bar()
+plt.show()
+data_x['permit'].value_counts().plot.bar()
+plt.show()
+data_x['subvillage'].value_counts().plot.bar()
+plt.show()
+data_x['wpt_name'].value_counts().plot.bar()
+plt.show()
+'''
+
+''' ------------------------------------------------------------------ '''
+''' ------------------------- PREPROCESSING -------------------------- '''
+''' ------------------------------------------------------------------ '''
+
+print("\nPreprocesando datos...")
+
+data_x = pd.concat([data_x_orig, data_x_tst])
+
+
+''' ------------------ DROP COLUMNS ------------------ '''
+
+print("  Borrando columnas...")
+columns_to_drop = ['id', 'num_private', 'recorded_by', 'scheme_name']
+data_x.drop(labels=columns_to_drop, axis=1, inplace = True)
+data_y.drop(labels=['id'], axis=1,inplace = True)
+
+
+
+''' ------------------ MISSING VALUES ------------------ '''
+
+print("  Modificando valores nan...")
+data_x['funder'] = data_x['funder'].fillna('Government Of Tanzania')
+data_x['installer'] = data_x['installer'].fillna('DWE')
+data_x['public_meeting'] = data_x['public_meeting'].fillna(True)
+data_x['scheme_management'] = data_x['scheme_management'].fillna('VWC')
+data_x['permit'] = data_x['permit'].fillna(True)
+data_x['subvillage'] = data_x['subvillage'].fillna('Unknown')
+data_x['wpt_name'] = data_x['wpt_name'].fillna('none')
+
+data_x.loc[data_x['latitude']>-0.1,'latitude']=None
+data_x.loc[data_x['longitude']==0,'longitude']=None
+data_x["longitude"] = data_x.groupby("region_code").transform(lambda x: x.fillna(x.median())).longitude
+data_x["latitude"] = data_x.groupby("region_code").transform(lambda x: x.fillna(x.median())).latitude
+
+data_x.construction_year=pd.to_numeric(data_x.construction_year)
+data_x.loc[data_x.construction_year <= 0, data_x.columns=='construction_year'] = 1950
+
+# mean() takes much longer, but improves the results slightly compared to median()
+#data_x=data_x.fillna(data_x.mean())
+#data_x = data_x.fillna(data_x.median())
+
+''' ------------------ RARE VALUES ------------------ '''
+
+print("  Etiquetando casos raros...")
+columns_other = [x for x in data_x.columns if x not in ['latitude','longitude','gps_height','age','population','construction_year','month_recorder']]
+
+for col in columns_other:
+    value_counts = data_x[col].value_counts()
+    lessthen = value_counts[value_counts < 20]
+    listnow = data_x[col].isin(list(lessthen.keys()))
+    data_x.loc[listnow,col] = 'Others'
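+
+# values that occur fewer than 20 times in a column are collapsed into a single
+# 'Others' label, so the label encoding below does not create near-unique categories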
+
+
+''' ------------------ CARTESIAN ------------------ '''
+
+print("  Preprocesando coordenadas y distancias...")
+data_x['dist'] = data_x.apply(lambda row: distancia(row['longitude'], row['latitude'], 0, 0), axis=1)
+data_x['cartesian_x'] = data_x.apply(lambda row: cartesian_x(row['longitude'], row['latitude']), axis=1)
+data_x['cartesian_y'] = data_x.apply(lambda row: cartesian_y(row['longitude'], row['latitude']), axis=1)
+data_x.drop(labels=['longitude', 'latitude'], axis=1, inplace = True)
+
+''' ------------------ DATES ------------------ '''
+
+print("  Preprocesando fechas...")
+data_x = date_parser(data_x)
+
+
+
+data_x.population = data_x.population.apply(lambda x: np.log10(x+1))
+
+print("  Convirtiendo categóricas a numéricas...")
+data_x = data_x.astype(str).apply(LabelEncoder().fit_transform)
+
+data_x_tst = data_x[len(data_x_orig):]
+data_x = data_x[:len(data_x_orig)]
+
+X = data_x.values
+y = np.ravel(data_y.values)
+#y = le.fit(y).transform(y)
+X_tst = data_x_tst.values
+
+print("Datos preprocesados con éxito.\n")
+
+
+''' -------------------- CROSS VALIDATION -------------------- '''
+
+'''
+print("Validación cruzada:\n")
+
+print('\nKNN\n')
+knn = KNeighborsClassifier(n_neighbors=5)
+cross_validation(clf=knn, X = X, y = y, cv = None, min_max_scaler = True)
+
+print('\nXGB\n')
+clf = xgb.XGBClassifier(n_estimators = 200)
+cross_validation(clf, X, y)
+
+print('\nLGB\n')
+clf = lgb.LGBMClassifier(objective='binary', n_estimators=200, num_leaves=31)
+cross_validation(clf, X, y)
+
+print('\nRandomForest\n')
+clf = RandomForestClassifier(n_estimators=125, max_depth = 20, random_state = 10)
+cross_validation(clf, X, y)
+
+print('\nExtraTreesClassifier\n')
+clf = ExtraTreesClassifier(n_estimators = 125, max_depth = 20)
+cross_validation(clf, X, y)
+'''
+
+''' -------------------- SUBMISSION 1 -------------------- '''
+'''
+clf = xgb.XGBClassifier(n_estimators = 200)
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission1.csv", index=False)
+'''
+''' ---------------------------------------------------- '''
+
+''' -------------------- SUBMISSION 2 -------------------- '''
+'''
+clf = RandomForestClassifier(n_estimators = 125)
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission2.csv", index=False)
+'''
+''' ---------------------------------------------------- '''
+
+''' -------------------- SUBMISSION 3 -------------------- '''
+'''
+clf = RandomForestClassifier()
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission3.csv", index=False)
+'''
+''' ---------------------------------------------------- '''
+
+
+''' -------------------- SUBMISSION 6 -------------------- '''
+'''
+# Eliminated features:
+# 'num_private', 'recorded_by', 'region', 'scheme_name', 'scheme_management'
+
+clf = RandomForestClassifier(max_features = 'sqrt', n_estimators = 500, random_state=10)
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission6.csv", index=False)
+'''
+''' ---------------------------------------------------- '''
+
+
+''' -------------------- SUBMISSION 8 -------------------- '''
+'''
+print("Submission 8")
+
+clf = RandomForestClassifier(max_features = 'sqrt', n_estimators = 200, max_depth = 20)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission9.csv", index=False)
+'''
+''' ---------------------------------------------------- '''
+
+''' -------------------- SUBMISSION 11 -------------------- '''
+'''
+print("Submission 11")
+
+clf = RandomForestClassifier(n_estimators=200, max_depth = 20, random_state = 10)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission11.csv", index=False)
+'''
+
+''' -------------------- SUBMISSION 12 -------------------- '''
+'''
+print("Submission 12")
+
+clf = RandomForestClassifier(n_estimators=125, max_depth = 20)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission12.csv", index=False)
+'''
+
+''' -------------------- SUBMISSION 13 -------------------- '''
+'''
+print("Submission 13")
+
+fit_rf = RandomForestClassifier(max_features = 'sqrt', max_depth=20)
+estimators = range(25,201,25)
+param_dist = {'n_estimators': estimators}
+
+clf= GridSearchCV(fit_rf, cv = 5, scoring = 'accuracy', param_grid=param_dist, n_jobs = 3)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission13.csv", index=False)
+'''
+
+''' -------------------- SUBMISSION 15 -------------------- '''
+'''
+print("Submission 15")
+
+clf = RandomForestClassifier(n_estimators=125, max_depth = 22)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission15.csv", index=False)
+'''
+''' -------------------- SUBMISSION 16 -------------------- '''
+'''
+print("Submission 16")
+
+clf = RandomForestClassifier(n_estimators=500)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission16.csv", index=False)
+
+# Note: this experiment worsens the results, possibly due to overfitting
+'''
+
+''' -------------------- SUBMISSION 17 -------------------- '''
+'''
+print("Submission 17")
+
+clf = RandomForestClassifier(n_estimators=120, max_depth = 20)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission17.csv", index=False)
+
+'''
+
+''' -------------------- SUBMISSION 18 -------------------- '''
+'''
+# fillna() with the most frequent value
+print("Submission 18")
+
+clf = RandomForestClassifier(n_estimators=160, max_depth = 20)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission18.csv", index=False)
+'''
+
+''' -------------------- SUBMISSION 19 -------------------- '''
+'''
+# fillna() with the most frequent value
+print("Submission 19")
+
+clf = RandomForestClassifier(n_estimators=150, max_depth = 20)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission19.csv", index=False)
+'''
+
+''' -------------------- SUBMISSION 22 -------------------- '''
+'''
+print("Submission 22")
+
+fit_rf = RandomForestClassifier(max_features = 'sqrt', max_depth=20)
+estimators = range(25,201,25)
+param_dist = {'n_estimators': estimators}
+
+clf= GridSearchCV(fit_rf, cv = 5, scoring = 'accuracy', param_grid=param_dist, n_jobs = 3)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission22.csv", index=False)
+
+best_param = clf.best_params_['n_estimators']
+print ("Mejor valor para n_estimators: ", best_param)
+'''
+''' -------------------- SUBMISSION 23 -------------------- '''
+'''
+print("Submission 23")
+
+fit_rf = RandomForestClassifier(max_features = 'sqrt', max_depth=25)
+estimators = range(100,1101,25)
+param_dist = {'n_estimators': estimators}
+
+clf= GridSearchCV(fit_rf, cv = 5, scoring = 'accuracy', param_grid=param_dist, n_jobs = 3)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission23.csv", index=False)
+
+best_param = clf.best_params_['n_estimators']
+print ("Mejor valor para n_estimators: ", best_param)
+'''
+
+
+''' -------------------- SUBMISSION 24 -------------------- '''
+'''
+print("Submission 24")
+
+clf = RandomForestClassifier(n_estimators=100, max_depth = 20)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission24.csv", index=False)
+
+'''
+''' -------------------- SUBMISSION 25 -------------------- '''
+'''
+print("Submission 25")
+
+clf = RandomForestClassifier(n_estimators=150, max_depth = 20)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission25.csv", index=False)
+'''
+
+
+''' ------------------- FINAL SUBMISSION ------------------ '''
+
+''' -------------------- SUBMISSION 26 -------------------- '''
+
+print("Submission 26")
+
+clf = RandomForestClassifier(n_estimators = 125, max_depth = 20)
+
+clf = clf.fit(X,y)
+
+y_pred_tst = clf.predict(X_tst)
+
+df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
+df_submission['status_group'] = y_pred_tst
+df_submission.to_csv("submission26.csv", index=False)

+ 98 - 0
data/purposeCombined/Directory/IOTA2Directory.py

@@ -0,0 +1,98 @@
+#!/usr/bin/python
+#-*- coding: utf-8 -*-
+
+# =========================================================================
+#   Program:   iota2
+#
+#   Copyright (c) CESBIO. All rights reserved.
+#
+#   See LICENSE for details.
+#
+#   This software is distributed WITHOUT ANY WARRANTY; without even
+#   the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+#   PURPOSE.  See the above copyright notices for more information.
+#
+# =========================================================================
+
+
+import os
+import shutil
+from Common import ServiceConfigFile as SCF
+
+
+def GenerateDirectories(cfg):
+    """
+    generate IOTA2 output directories
+    """
+    if not isinstance(cfg, SCF.serviceConfigFile):
+        cfg = SCF.serviceConfigFile(cfg)
+
+    root = cfg.getParam('chain', 'outputPath')
+    rm_PathTEST = cfg.getParam("chain", "remove_outputPath")
+    start_step = cfg.getParam("chain", "firstStep")
+
+    if os.path.exists(root) and root != "/" and rm_PathTEST and start_step == "init":
+        shutil.rmtree(root,ignore_errors=False)
+    os.mkdir(root)
+    if os.path.exists(root+"/logs"):
+        shutil.rmtree(root+"/logs")
+    os.mkdir(root+"/logs")
+    if os.path.exists(root+"/samplesSelection"):
+        shutil.rmtree(root+"/samplesSelection")
+    os.mkdir(root+"/samplesSelection")
+    if os.path.exists(root+"/model"):
+        shutil.rmtree(root+"/model")
+    os.mkdir(root+"/model")
+    if os.path.exists(root+"/formattingVectors"):
+        shutil.rmtree(root+"/formattingVectors")
+    os.mkdir(root+"/formattingVectors")
+    if os.path.exists(root+"/config_model"):
+        shutil.rmtree(root+"/config_model")
+    os.mkdir(root+"/config_model")
+    if os.path.exists(root+"/envelope"):
+        shutil.rmtree(root+"/envelope")
+    os.mkdir(root+"/envelope")
+    if os.path.exists(root+"/classif"):
+        shutil.rmtree(root+"/classif")
+    os.mkdir(root+"/classif")
+    if os.path.exists(root+"/shapeRegion"):
+        shutil.rmtree(root+"/shapeRegion")
+    os.mkdir(root+"/shapeRegion")
+    if os.path.exists(root+"/final"):
+        shutil.rmtree(root+"/final")
+    os.mkdir(root+"/final")
+    os.mkdir(root+"/final/simplification")
+    os.mkdir(root+"/final/simplification/tiles")
+    os.mkdir(root+"/final/simplification/vectors")    
+    os.mkdir(root+"/final/simplification/tmp")
+    if os.path.exists(root+"/features"):
+        shutil.rmtree(root+"/features")
+    os.mkdir(root+"/features")
+    if os.path.exists(root+"/dataRegion"):
+        shutil.rmtree(root+"/dataRegion")
+    os.mkdir(root+"/dataRegion")
+    if os.path.exists(root+"/learningSamples"):
+        shutil.rmtree(root+"/learningSamples")
+    os.mkdir(root+"/learningSamples")
+    if os.path.exists(root+"/dataAppVal"):
+        shutil.rmtree(root+"/dataAppVal")
+    os.mkdir(root+"/dataAppVal")
+    if os.path.exists(root+"/stats"):
+        shutil.rmtree(root+"/stats")
+    os.mkdir(root+"/stats")
+    
+    if os.path.exists(root+"/cmd"):
+        shutil.rmtree(root+"/cmd")
+    os.mkdir(root+"/cmd")
+    os.mkdir(root+"/cmd/stats")
+    os.mkdir(root+"/cmd/train")
+    os.mkdir(root+"/cmd/cla")
+    os.mkdir(root+"/cmd/confusion")
+    os.mkdir(root+"/cmd/features")
+    os.mkdir(root+"/cmd/fusion")
+    os.mkdir(root+"/cmd/splitShape")
+
+    merge_final_classifications = cfg.getParam('chain', 'merge_final_classifications')
+    if merge_final_classifications:
+        if os.path.exists(root+"/final/merge_final_classifications"):
+            shutil.rmtree(root+"/final/merge_final_classifications")

+ 31 - 0
data/purposeCombined/Directory/advance_touch.py

@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Libraries
+import os
+import click
+
+@click.command()
+@click.argument('paths', nargs=-1)
+@click.option('-cd/--change', is_flag=True, default=False, help='After creating the directories, change to the new deeper directory.')
+def advance_touch(paths, cd):
+    """ Make folders and files """
+    for path in paths:
+        # Make folders
+        new_dirs = '/'.join(path.split('/')[0:-1])
+        if not os.path.exists(new_dirs) and new_dirs != '':
+            os.makedirs(new_dirs)
+        # Change directory
+        if cd:
+            cd_path = os.path.join(os.getcwd(), new_dirs) + '/'
+            os.chdir(cd_path)
+
+        # Make file
+        if not path.endswith('/') and not os.path.isfile(path):
+            try:
+                open(path, 'w+').close()
+            except IsADirectoryError:
+                pass
+
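+# Example (sketch): `python advance_touch.py docs/notes/todo.txt` creates the
+# docs/notes/ directories (if missing) and an empty todo.txt inside them.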
+if __name__ == '__main__':
+    advance_touch()

+ 213 - 0
data/purposeCombined/Directory/augmentation_main.py

@@ -0,0 +1,213 @@
+from __future__ import print_function, unicode_literals
+import os
+from twisted.python import filepath
+from twisted.trial import unittest
+from .. import database
+from ..database import (CHANNELDB_TARGET_VERSION, USAGEDB_TARGET_VERSION,
+                        _get_db, dump_db, DBError)
+
+class Get(unittest.TestCase):
+    def test_create_default(self):
+        db_url = ":memory:"
+        db = _get_db(db_url, "channel", CHANNELDB_TARGET_VERSION)
+        rows = db.execute("SELECT * FROM version").fetchall()
+        self.assertEqual(len(rows), 1)
+        self.assertEqual(rows[0]["version"], CHANNELDB_TARGET_VERSION)
+
+    def test_open_existing_file(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "normal.db")
+        db = _get_db(fn, "channel", CHANNELDB_TARGET_VERSION)
+        rows = db.execute("SELECT * FROM version").fetchall()
+        self.assertEqual(len(rows), 1)
+        self.assertEqual(rows[0]["version"], CHANNELDB_TARGET_VERSION)
+        db2 = _get_db(fn, "channel", CHANNELDB_TARGET_VERSION)
+        rows = db2.execute("SELECT * FROM version").fetchall()
+        self.assertEqual(len(rows), 1)
+        self.assertEqual(rows[0]["version"], CHANNELDB_TARGET_VERSION)
+
+    def test_open_bad_version(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "old.db")
+        db = _get_db(fn, "channel", CHANNELDB_TARGET_VERSION)
+        db.execute("UPDATE version SET version=999")
+        db.commit()
+
+        with self.assertRaises(DBError) as e:
+            _get_db(fn, "channel", CHANNELDB_TARGET_VERSION)
+        self.assertIn("Unable to handle db version 999", str(e.exception))
+
+    def test_open_corrupt(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "corrupt.db")
+        with open(fn, "wb") as f:
+            f.write(b"I am not a database")
+        with self.assertRaises(DBError) as e:
+            _get_db(fn, "channel", CHANNELDB_TARGET_VERSION)
+        self.assertIn("not a database", str(e.exception))
+
+    def test_failed_create_allows_subsequent_create(self):
+        patch = self.patch(database, "get_schema", lambda version: b"this is a broken schema")
+        dbfile = filepath.FilePath(self.mktemp())
+        self.assertRaises(Exception, lambda: _get_db(dbfile.path))
+        patch.restore()
+        _get_db(dbfile.path, "channel", CHANNELDB_TARGET_VERSION)
+
+    def test_upgrade(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "upgrade.db")
+        self.assertNotEqual(USAGEDB_TARGET_VERSION, 1)
+
+        # create an old-version DB in a file
+        db = _get_db(fn, "usage", 1)
+        rows = db.execute("SELECT * FROM version").fetchall()
+        self.assertEqual(len(rows), 1)
+        self.assertEqual(rows[0]["version"], 1)
+        del db
+
+        # then upgrade the file to the latest version
+        dbA = _get_db(fn, "usage", USAGEDB_TARGET_VERSION)
+        rows = dbA.execute("SELECT * FROM version").fetchall()
+        self.assertEqual(len(rows), 1)
+        self.assertEqual(rows[0]["version"], USAGEDB_TARGET_VERSION)
+        dbA_text = dump_db(dbA)
+        del dbA
+
+        # make sure the upgrades got committed to disk
+        dbB = _get_db(fn, "usage", USAGEDB_TARGET_VERSION)
+        dbB_text = dump_db(dbB)
+        del dbB
+        self.assertEqual(dbA_text, dbB_text)
+
+        # The upgraded schema should be equivalent to that of a new DB.
+        latest_db = _get_db(":memory:", "usage", USAGEDB_TARGET_VERSION)
+        latest_text = dump_db(latest_db)
+        with open("up.sql","w") as f: f.write(dbA_text)
+        with open("new.sql","w") as f: f.write(latest_text)
+        # debug with "diff -u _trial_temp/up.sql _trial_temp/new.sql"
+        self.assertEqual(dbA_text, latest_text)
+
+    def test_upgrade_fails(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "upgrade.db")
+        self.assertNotEqual(USAGEDB_TARGET_VERSION, 1)
+
+        # create an old-version DB in a file
+        db = _get_db(fn, "usage", 1)
+        rows = db.execute("SELECT * FROM version").fetchall()
+        self.assertEqual(len(rows), 1)
+        self.assertEqual(rows[0]["version"], 1)
+        del db
+
+        # then upgrade the file to a too-new version, for which we have no
+        # upgrader
+        with self.assertRaises(DBError):
+            _get_db(fn, "usage", USAGEDB_TARGET_VERSION+1)
+
+class CreateChannel(unittest.TestCase):
+    def test_memory(self):
+        db = database.create_channel_db(":memory:")
+        latest_text = dump_db(db)
+        self.assertIn("CREATE TABLE", latest_text)
+
+    def test_preexisting(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "preexisting.db")
+        with open(fn, "w"):
+            pass
+        with self.assertRaises(database.DBAlreadyExists):
+            database.create_channel_db(fn)
+
+    def test_create(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "created.db")
+        db = database.create_channel_db(fn)
+        latest_text = dump_db(db)
+        self.assertIn("CREATE TABLE", latest_text)
+
+    def test_create_or_upgrade(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "created.db")
+        db = database.create_or_upgrade_channel_db(fn)
+        latest_text = dump_db(db)
+        self.assertIn("CREATE TABLE", latest_text)
+
+class CreateUsage(unittest.TestCase):
+    def test_memory(self):
+        db = database.create_usage_db(":memory:")
+        latest_text = dump_db(db)
+        self.assertIn("CREATE TABLE", latest_text)
+
+    def test_preexisting(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "preexisting.db")
+        with open(fn, "w"):
+            pass
+        with self.assertRaises(database.DBAlreadyExists):
+            database.create_usage_db(fn)
+
+    def test_create(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "created.db")
+        db = database.create_usage_db(fn)
+        latest_text = dump_db(db)
+        self.assertIn("CREATE TABLE", latest_text)
+
+    def test_create_or_upgrade(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "created.db")
+        db = database.create_or_upgrade_usage_db(fn)
+        latest_text = dump_db(db)
+        self.assertIn("CREATE TABLE", latest_text)
+
+    def test_create_or_upgrade_disabled(self):
+        db = database.create_or_upgrade_usage_db(None)
+        self.assertIs(db, None)
+
+class OpenChannel(unittest.TestCase):
+    def test_open(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "created.db")
+        db1 = database.create_channel_db(fn)
+        latest_text = dump_db(db1)
+        self.assertIn("CREATE TABLE", latest_text)
+        db2 = database.open_existing_db(fn)
+        self.assertIn("CREATE TABLE", dump_db(db2))
+
+    def test_doesnt_exist(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "created.db")
+        with self.assertRaises(database.DBDoesntExist):
+            database.open_existing_db(fn)
+
+class OpenUsage(unittest.TestCase):
+    def test_open(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "created.db")
+        db1 = database.create_usage_db(fn)
+        latest_text = dump_db(db1)
+        self.assertIn("CREATE TABLE", latest_text)
+        db2 = database.open_existing_db(fn)
+        self.assertIn("CREATE TABLE", dump_db(db2))
+
+    def test_doesnt_exist(self):
+        basedir = self.mktemp()
+        os.mkdir(basedir)
+        fn = os.path.join(basedir, "created.db")
+        with self.assertRaises(database.DBDoesntExist):
+            database.open_existing_db(fn)
+

+ 92 - 0
data/purposeCombined/Directory/conftest.py

@@ -0,0 +1,92 @@
+import os
+import shutil
+
+import pytest
+
+
+def create_file(path: str, content: str):
+    """Create txt file with specific content"""
+    with open(f"{path}", "w") as file:
+        file.write(content)
+
+
+@pytest.fixture
+def create_files():
+    """Create files with equal or non-equal content"""
+    create_file("tests/file1.txt", "hello, world")
+    create_file("tests/file2.txt", "hello, world!")
+    create_file("tests/file3.txt", "hello, world")
+    yield
+    os.remove("tests/file1.txt")
+    os.remove("tests/file2.txt")
+    os.remove("tests/file3.txt")
+
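+# Example (sketch): a test requests this fixture simply by naming it as a parameter:
+#     def test_equal_content(create_files):
+#         assert open("tests/file1.txt").read() == open("tests/file3.txt").read()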
+
+@pytest.fixture
+def create_dirs_and_files():
+    os.makedirs("tests/dir1/dir2")
+    os.makedirs("tests/dir3/dir4")
+    create_file("tests/dir1/file1.txt", "aaa")
+    create_file("tests/dir3/file2.txt", "bbb")
+    yield
+    shutil.rmtree("tests/dir1")
+    shutil.rmtree("tests/dir3")
+
+
+@pytest.fixture
+def create_nested_dirs_and_files_first_case():
+    """Create common case for synch function"""
+    os.makedirs("tests/source/dir1")
+    os.mkdir("tests/source/dir2")
+    os.mkdir("tests/source/dir3")
+    create_file("tests/source/dir1/file1.txt", "abacaba")
+    os.makedirs("tests/replica/dir1")
+    os.mkdir("tests/replica/dir4")
+    yield
+    shutil.rmtree("tests/source")
+    shutil.rmtree("tests/replica")
+
+
+@pytest.fixture
+def create_nested_dirs_and_files_second_case():
+    """Create common case for synch function"""
+    os.makedirs("tests/source/dir1/dir2")
+    create_file("tests/source/dir1/dir2/file1.txt", "hello")
+    os.makedirs("tests/replica/dir1")
+    os.mkdir("tests/replica/dir4")
+    create_file("tests/replica/dir4/file2.txt", "hello")
+    yield
+    shutil.rmtree("tests/source")
+    shutil.rmtree("tests/replica")
+
+
+@pytest.fixture
+def create_two_different_files():
+    """Create two different files"""
+    os.mkdir("tests/source")
+    os.mkdir("tests/replica")
+    create_file("tests/source/file1.txt", "aaa")
+    create_file("tests/replica/file1.txt", "bbb")
+    yield
+    shutil.rmtree("tests/source")
+    shutil.rmtree("tests/replica")
+
+
+@pytest.fixture
+def create_empty_source_dir():
+    """Create empty source dir and non-empty replic's one"""
+    os.mkdir("tests/source")
+    os.makedirs("tests/replica/dir1")
+    yield
+    shutil.rmtree("tests/source")
+    shutil.rmtree("tests/replica")
+
+
+@pytest.fixture
+def create_empty_replica_dir():
+    """Create empty replica dir and non-empty source one"""
+    os.makedirs("tests/source/dir1/dir2")
+    os.mkdir("tests/replica")
+    yield
+    shutil.rmtree("tests/source")
+    shutil.rmtree("tests/replica")

+ 394 - 0
data/purposeCombined/Directory/data_preprocessing_utils.py

@@ -0,0 +1,394 @@
+# Customary Imports:
+import tensorflow as tf
+assert '2.' in tf.__version__  # make sure you're using tf 2.0
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import sklearn
+import skimage
+import cv2 as cv
+import os
+import datetime
+import scipy
+from skimage.morphology import reconstruction
+from skimage import exposure
+import scipy.io as sio
+import h5py
+import random
+import shutil
+import PIL
+import imageio
+import pydot 
+import graphviz
+import plotly.graph_objects as go
+import preprocess_crop
+from pathlib import Path
+from tensorflow.keras import backend as K
+from PIL import Image
+from keras.preprocessing.image import ImageDataGenerator
+from tensorflow.keras.layers import Dense, Flatten, Conv2D
+from tensorflow.keras import Model
+#from keras.utils import CustomObjectScope
+from mpl_toolkits.mplot3d import Axes3D
+import data_preprocessing_utils
+##################################################################################################################################
+'''
+DATA PREPROCESSING UTILS:
+'''
+##################################################################################################################################
+# Converting MAP Files:
+def convert_MAP(directory, output_directory, min_shape, file_format = '.npy', search_keys = None, dtype = np.float32):
+    '''
+    Loops through the given raw-data directory and converts each .mat file
+    into the requested file_format (.npy by default); arrays smaller than min_shape are rejected
+    '''
+    new_dir = os.path.join(os.getcwd(), output_directory)
+    if not os.path.exists(new_dir):
+        os.mkdir(new_dir)
+    else:
+        shutil.rmtree(new_dir)
+        os.mkdir(new_dir)
+    for file in os.listdir(directory):
+        filename = os.fsdecode(file)
+        if filename.endswith(".mat"): 
+            #print(os.path.join(directory, filename))
+            filepath = os.path.join(directory, filename)
+            array_dict = {}
+            try:
+                f = h5py.File(filepath, 'r')
+            except:
+                f = sio.loadmat(filepath)
+            for k, v in f.items():
+                array_dict[k] = np.array(v, dtype = np.float32)
+            # As we only need image info from dict (the last key) we do this
+            if search_keys is None:
+                # out of the struct of .mat files we only want the "map" entries; use a
+                # literal so the parameter is not overwritten for the following files
+                filtered_dict = dict(filter(lambda item: 'map' in item[0], array_dict.items()))
+            else:
+                filtered_dict = {}
+                for i in range(len(search_keys)):
+                    search_key = search_keys[i]
+                    if search_key in array_dict:
+                        filtered_dict[search_key] = array_dict[search_key]
+            if len(filtered_dict) == 0:
+                print('No Data to Meet Search Key Requirements: Datapoint Rejected -> ' + filepath)
+            else:
+                #print(list(array_dict.keys()))
+                #print(filtered_dict)
+                arrays = []
+                for k, v in filtered_dict.items():
+                    temp = np.transpose(v.astype(np.float32))
+                    # To normalize data between [-1,1], use -> arrays = arrays/(np.max(arrays)/2) - 1
+                    # To normalize data between [0,1], use -> arrays = arrays/(np.max(arrays))
+                    # To normalize data between [0,255], 
+                    #     use -> arrays = (arrays/(np.max(arrays))*255).astype(np.uint8)
+                    temp = temp/(np.max(temp))
+                    arrays.append(temp)
+                for i in range(len(arrays)):
+                    if len(arrays[i].shape) > 2:
+                        #print(arrays[i].shape)
+                        arrays[i] = np.mean(arrays[i], axis = 2)
+
+                for i in range(len(arrays)):
+                    # os.path.splitext drops the '.mat' suffix (str.strip would strip characters)
+                    new_dir_filepath = os.path.join(new_dir, os.path.splitext(filename)[0]
+                                                    + '_index' + str(i) + file_format)
+                    array = arrays[i]
+                    if array.shape[0] >= min_shape[0] and array.shape[1] >= min_shape[1]:
+                        if file_format == '.npy':
+                            np.save(new_dir_filepath, array, allow_pickle=True, fix_imports=True)
+                        else:
+                            imageio.imwrite(new_dir_filepath, array)
+                    elif i == 0:
+                        print('Min Size Not Met: Datapoint Rejected -> ' + filepath)
+    return os.path.join(os.getcwd(), output_directory)
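+
+# Example (sketch; hypothetical folder names):
+#     converted_dir = convert_MAP('raw_data', 'converted_data', min_shape=(128, 128))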
+
+##################################################################################################################################
+# Data Cleaning Procedures:
+def data_clean_func(image = None):
+    if image is not None:
+        #print(len(np.unique(image)))
+        #clean_image = image
+        '''
+        plt.hist(image)
+        plt.show()
+        '''
+        '''
+        plt.imshow(image, cmap='gray')
+        plt.title('Original Image')
+        plt.show()
+        '''
+        threshold = 0.85
+        default_fill = 0.0
+        frac_of_high_clip = 1/9
+        image[image > threshold] = default_fill
+        image[image < frac_of_high_clip*(1.0-threshold)] = default_fill
+        '''
+        plt.imshow(image, cmap='gray')
+        plt.title('After Clipping')
+        plt.show()
+        '''
+        image = scipy.ndimage.median_filter(image, size=(4, 4))
+        '''
+        plt.imshow(image, cmap='gray')
+        plt.title('After Median Filter')
+        plt.show()
+        '''
+        image = skimage.filters.gaussian(image, sigma=0.01, output=None, mode='reflect', preserve_range=True)
+        ####################################################################
+        # Added to ensure negligible loss when converting to int16 
+        # within exposure.equalize_adapthist
+        image = (image/np.max(image)*(2**16)).astype(np.uint16)
+        # A "Monkey Patch" could possibly be used as a cleaner solution, 
+        # but would be more involved than is necessary for my application
+        ####################################################################
+        image = exposure.equalize_adapthist(image,kernel_size=image.shape[0]//8, clip_limit=0.005, nbins=2**13)
+        image = image.astype(np.float64)
+        '''
+        plt.imshow(image, cmap='gray')
+        plt.title('After Local Adapt Hist')
+        plt.show()
+        '''
+        image = scipy.ndimage.median_filter(image, size=(3, 1))
+        image = scipy.ndimage.median_filter(image, size=(1, 3))
+        image = skimage.filters.gaussian(image, sigma=0.1, output=None, mode='reflect', preserve_range=True)
+        image = exposure.rescale_intensity(image, in_range='image', out_range=(0.0,1.0))
+        '''
+        plt.imshow(image, cmap='gray')
+        plt.title('Final Image')
+        plt.show()
+        '''
+        '''
+        plt.hist(image)
+        plt.show()
+        '''
+        clean_image = image.astype(np.float32)
+    else:
+        clean_image = image
+    return clean_image
+
+def data_cleaning(input_dir = 'converted_data', output_dir_name = 'cleaned_data',
+                  output_file_format ='.npy', delete_previous = True):
+    '''
+     Removes some noise from the data and makes the underlying vessel
+     structure more prominent
+     Input: input_dir -> directory that holds the data to be cleaned
+            output_dir_name -> name of the directory that will hold the cleaned data
+     Output: None
+    '''
+    file_list = os.listdir(input_dir)
+    clean_dir = os.path.join(os.getcwd(), output_dir_name)
+    if not os.path.exists(clean_dir):
+        os.mkdir(clean_dir)
+    elif delete_previous == True:
+        shutil.rmtree(clean_dir)
+        os.mkdir(clean_dir)
+    for file in file_list:
+        filename = os.fsdecode(file)
+        filepath = os.path.join(input_dir, filename)
+        if filepath.endswith('.npy'):
+            array = np.load(filepath)
+        else:
+            array = imageio.imread(filepath)
+            
+        # Defined data clean function above:
+        array = data_preprocessing_utils.data_clean_func(array)
+    
+        new_filepath = os.path.join(clean_dir, filename)
+        if output_file_format == '.npy':
+            new_filepath = Path(new_filepath)
+            new_filepath = new_filepath.with_suffix('')
+            new_filepath = new_filepath.with_suffix(output_file_format)
+            np.save(new_filepath, array, allow_pickle=True, fix_imports=True)
+        else:
+            new_filepath = Path(new_filepath)
+            new_filepath = new_filepath.with_suffix('')
+            new_filepath = new_filepath.with_suffix(output_file_format)
+            imageio.imwrite(new_filepath, array)
+    return  
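+
+# Example (sketch): clean every converted map into 'cleaned_data'
+#     data_cleaning(input_dir='converted_data', output_dir_name='cleaned_data')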
+
+    
+##################################################################################################################################
+# Data Separation / Validation Split Procedures:
+def data_seperation(input_dir, dataset_percentages, 
+                    delete_previous = False, file_format = '.npy', 
+                    scale = 1):
+    '''
+    Takes a directory of numpy arrays (or images) and creates a 'data' folder with separate
+    sections for training, validation, and testing according to the given percentages
+    Input: input_dir -> path to the folder of numpy (or image) files
+           dataset_percentages -> (% train, % test) such that % train + % test = 100
+           OR
+           dataset_percentages -> (% train, % val, % test) such that % train + % val + % test = 100
+    Output: new folders for training and testing or training/validation/testing
+    '''
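+    # Example (sketch): data_seperation('cleaned_data', (80, 10, 10)) creates
+    # data/train/input, data/val/input and data/test/input with an 80/10/10 split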
+    
+    # If just train and test
+    if len(dataset_percentages) == 2:
+        # Making Main data folder
+        new_dir = os.path.join(os.getcwd(), 'data')
+        if not os.path.exists(new_dir):
+            os.mkdir(new_dir)
+        
+        # Making train subfolder
+        train_dir = os.path.join(new_dir, 'train')
+        if not os.path.exists(train_dir):
+            os.mkdir(train_dir)
+            train_dir = os.path.join(train_dir, 'input')
+            os.mkdir(train_dir)
+        elif delete_previous == True:
+            shutil.rmtree(train_dir)
+            os.mkdir(train_dir)
+            train_dir = os.path.join(train_dir, 'input')
+            os.mkdir(train_dir)
+        
+        # Making test subfolder
+        test_dir = os.path.join(new_dir, 'test')
+        if not os.path.exists(test_dir):
+            os.mkdir(test_dir)
+            test_dir = os.path.join(test_dir, 'input')
+            os.mkdir(test_dir)
+        elif delete_previous == True:
+            shutil.rmtree(test_dir)
+            os.mkdir(test_dir)
+            test_dir = os.path.join(test_dir, 'input')
+            os.mkdir(test_dir)
+
+
+        file_list = os.listdir(input_dir)
+        total_num_imgs = len(file_list)
+        train_percent = dataset_percentages[0]
+        test_percent = dataset_percentages[1]
+        valid_inputs = (train_percent >= test_percent and train_percent <= 100 and
+                        test_percent <= 100 and train_percent > 0 and test_percent > 0 and
+                        train_percent + test_percent == 100)
+        if valid_inputs:
+            num_train = int(round(total_num_imgs * train_percent//100))
+        else:
+            num_train = int(round(total_num_imgs * 0.9))
+            print('ERROR: Please input valid percentages for dataset division')
+            print('In place of valid input the ratio 90% train, 10% test was used')
+        
+        index = 0
+        random.shuffle(file_list)
+        for file in file_list:
+            filename = os.fsdecode(file)
+            filepath = os.path.join(input_dir, filename)
+            # Loads File
+            if filepath.endswith('.npy'):
+                array = np.load(filepath)
+                array = array/np.max(array)*scale
+            else:
+                array = imageio.imread(filepath)
+                array = array/np.max(array)*scale
+            if index < num_train:
+                new_filepath = os.path.join(train_dir, filename)
+            else:
+                new_filepath = os.path.join(test_dir, filename)
+            # Saves File
+            if file_format == '.npy':
+                new_filepath = Path(new_filepath)
+                new_filepath = new_filepath.with_suffix('')
+                new_filepath = new_filepath.with_suffix(file_format)
+                np.save(new_filepath, array, allow_pickle=True, fix_imports=True)
+            else:
+                new_filepath = Path(new_filepath)
+                new_filepath = new_filepath.with_suffix('')
+                new_filepath = new_filepath.with_suffix(file_format)
+                imageio.imwrite(new_filepath, array)
+            index += 1
+        return train_dir, test_dir
+    # If train, val, and test
+    elif len(dataset_percentages) == 3:
+        # Making Main data folder
+        new_dir = os.path.join(os.getcwd(), 'data')
+        if not os.path.exists(new_dir):
+            os.mkdir(new_dir)
+            
+        # Making train subfolder
+        train_dir = os.path.join(new_dir, 'train')
+        if not os.path.exists(train_dir):
+            os.mkdir(train_dir)
+            train_dir = os.path.join(train_dir, 'input')
+            os.mkdir(train_dir)
+        elif delete_previous == True:
+            shutil.rmtree(train_dir)
+            os.mkdir(train_dir)
+            train_dir = os.path.join(train_dir, 'input')
+            os.mkdir(train_dir)
+        
+        # Making val subfolder
+        val_dir = os.path.join(new_dir, 'val')
+        if not os.path.exists(val_dir):
+            os.mkdir(val_dir)
+            val_dir = os.path.join(val_dir, 'input')
+            os.mkdir(val_dir)
+        elif delete_previous == True:
+            shutil.rmtree(val_dir)
+            os.mkdir(val_dir)
+            val_dir = os.path.join(val_dir, 'input')
+            os.mkdir(val_dir)
+        
+        # Making test subfolder
+        test_dir = os.path.join(new_dir, 'test')
+        if not os.path.exists(test_dir):
+            os.mkdir(test_dir)
+            test_dir = os.path.join(test_dir, 'input')
+            os.mkdir(test_dir)
+        elif delete_previous == True:
+            shutil.rmtree(test_dir)
+            os.mkdir(test_dir)
+            test_dir = os.path.join(test_dir, 'input')
+            os.mkdir(test_dir)
+            
+        file_list = os.listdir(input_dir)
+        total_num_imgs = len(file_list)
+        train_percent = dataset_percentages[0]
+        val_percent = dataset_percentages[1]
+        test_percent = dataset_percentages[2]
+        valid_inputs = (train_percent >= test_percent and train_percent >= val_percent 
+                        and train_percent <= 100 and val_percent <= 100 and test_percent <= 100
+                        and train_percent > 0 and val_percent > 0 and test_percent > 0 and
+                        train_percent + val_percent + test_percent == 100)
+        if valid_inputs:
+            num_train = int(round(total_num_imgs * train_percent//100))
+            num_val = int(round(total_num_imgs * val_percent//100))
+        else:
+            num_train = int(round(total_num_imgs * 0.9))
+            num_val = int(round((total_num_imgs - num_train)/2))
+            print('ERROR: Please input valid percentages for dataset division')
+            print('In place of a valid input the ratio 90% train, 5% val, 5% test was used')
+        
+        index = 0
+        random.shuffle(file_list)
+        for file in file_list:
+            filename = os.fsdecode(file)
+            filepath = os.path.join(input_dir, filename)
+            # Loads File
+            if filepath.endswith('.npy'):
+                array = np.load(filepath)
+                array = array/np.max(array)*scale
+            else:
+                array = imageio.imread(filepath)
+                array = array/np.max(array)*scale
+            if index < num_train:
+                new_filepath = os.path.join(train_dir, filename)
+            elif index < num_train + num_val:
+                new_filepath = os.path.join(val_dir, filename)
+            else:
+                new_filepath = os.path.join(test_dir, filename)
+            # Saves File
+            if file_format == '.npy':
+                new_filepath = Path(new_filepath)
+                new_filepath = new_filepath.with_suffix('')
+                new_filepath = new_filepath.with_suffix(file_format)
+                np.save(new_filepath, array, allow_pickle=True, fix_imports=True)
+            else:
+                new_filepath = Path(new_filepath)
+                new_filepath = new_filepath.with_suffix('')
+                new_filepath = new_filepath.with_suffix(file_format)
+                imageio.imwrite(new_filepath, array)
+            index += 1
+        return train_dir, val_dir, test_dir
+    else:
+        print('ERROR: Please divide into train/test or train/val/test')
+        return None
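+# Illustrative usage sketch (not part of the original script): the directory name
+# 'raw_arrays' and the 80/10/10 split below are assumptions for demonstration only.
+# train_dir, val_dir, test_dir = data_seperation('raw_arrays', (80, 10, 10),
+#                                                delete_previous=True, file_format='.npy')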

+ 122 - 0
data/purposeCombined/Directory/diml_to_interiornet.py

@@ -0,0 +1,122 @@
+import cv2
+import os
+import shutil
+import numpy as np
+
+
+def sample_to_interiornet():
+    diml_path = "/nvme/datasets/diml_depth/scenes"
+    hd7_path = "/nvme/datasets/diml_depth/HD7"
+
+    depth_paths = [
+        "/nvme/datasets/diml_depth/train/HR/11. Bedroom/depth_filled",
+        "/nvme/datasets/diml_depth/train/HR/12. Livingroom/depth_filled"]
+
+    depth_images = []
+    for path in depth_paths:
+        depth_images += [os.path.join(path, name) for name in os.listdir(path)
+                         if os.path.isfile(os.path.join(path, name))]
+
+    scene_paths = [os.path.join(diml_path, name) for name in os.listdir(diml_path)
+                   if os.path.isdir(os.path.join(diml_path, name))]
+
+    for scene_path in scene_paths:
+        frame_paths = [os.path.join(scene_path, name) for name in os.listdir(scene_path)
+                       if os.path.isfile(os.path.join(scene_path, name))]
+
+        new_frame_path = os.path.join(hd7_path, scene_path.split('/')[-1])
+        os.mkdir(new_frame_path)
+        os.mkdir(os.path.join(new_frame_path, "cam0"))
+        os.mkdir(os.path.join(new_frame_path, "depth0"))
+        os.mkdir(os.path.join(new_frame_path, "label0"))
+        os.mkdir(os.path.join(new_frame_path, "cam0", "data"))
+        os.mkdir(os.path.join(new_frame_path, "depth0", "data"))
+        os.mkdir(os.path.join(new_frame_path, "label0", "data"))
+        print(new_frame_path)
+        for i, frame_path in enumerate(frame_paths):
+            file_name = frame_path.split('/')[-1][:-6]
+            img = cv2.imread(frame_path, cv2.IMREAD_UNCHANGED)
+            print(file_name)
+            depth_path = [path for path in depth_images if file_name in path][0]
+            depth_img = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)
+
+            img = cv2.resize(img, dsize=(img.shape[1] // 2, img.shape[0] // 2), interpolation=cv2.INTER_LINEAR)
+            depth_img = cv2.resize(depth_img, dsize=(depth_img.shape[1] // 2, depth_img.shape[0] // 2),
+                                   interpolation=cv2.INTER_LINEAR)
+            label_img = depth_img.copy()
+            label_img[:, :] = 3
+
+            cv2.imwrite(os.path.join(new_frame_path, "cam0", "data", "{}.png".format(i)), img)
+            cv2.imwrite(os.path.join(new_frame_path, "depth0", "data", "{}.png".format(i)), depth_img)
+            cv2.imwrite(os.path.join(new_frame_path, "label0", "data", "{}_instance.png".format(i)), label_img)
+            cv2.imwrite(os.path.join(new_frame_path, "label0", "data", "{}_nyu.png".format(i)), label_img)
+
+
+def full_to_interiornet():
+    scene_file_path = "/nvme/datasets/diml_depth/scenes.txt"
+    base_path = "/nvme/datasets/diml_depth/"
+    out_path = "/nvme/datasets/diml_depth/HD7/"
+    cam0_render = "/nvme/datasets/interiornet/3FO4IDEI1LAV_Bedroom/cam0.render"
+    num_frames = 20
+    shape = (672, 378)
+    np.random.seed(123)
+
+    with open(scene_file_path, 'r') as f:
+        scene_lines = f.readlines()
+
+    scene_lines = [sn.split('\n')[0] for sn in scene_lines]
+    scene_paths = [os.path.join(base_path, sn.split('-')[0]) for sn in scene_lines]
+    scene_ranges = [sn.split('-')[1] for sn in scene_lines]
+    scene_ranges = [(int(rn[1:-1].split(':')[0]), int(rn[1:-1].split(':')[1])) for rn in scene_ranges]
+
+    for i, scene_path in enumerate(scene_paths):
+        file_list = []
+        for j in range(scene_ranges[i][0], scene_ranges[i][1]+1):
+            scene_path_col = os.path.join(scene_path, "{}/col".format(j))
+            if os.path.exists(scene_path_col):
+                file_list += [os.path.join(scene_path_col, dn) for dn in os.listdir(scene_path_col)]
+
+        scene_count = len(os.listdir(out_path))
+        scene_out_path = "{:02d}DIML_{}".format(scene_count + 1, scene_path.split('/')[-2].split(' ')[1])
+        scene_out_path = os.path.join(out_path, scene_out_path)
+
+        if os.path.exists(scene_out_path):
+            shutil.rmtree(scene_out_path)
+        os.mkdir(scene_out_path)
+        os.mkdir(os.path.join(scene_out_path, "cam0"))
+        os.mkdir(os.path.join(scene_out_path, "depth0"))
+        os.mkdir(os.path.join(scene_out_path, "label0"))
+        os.mkdir(os.path.join(scene_out_path, "cam0", "data"))
+        os.mkdir(os.path.join(scene_out_path, "depth0", "data"))
+        os.mkdir(os.path.join(scene_out_path, "label0", "data"))
+        shutil.copyfile(cam0_render, os.path.join(scene_out_path, "cam0.render"))
+        print(scene_out_path)
+
+        frame_paths = np.random.choice(file_list, num_frames, False)
+        for j, frame_path in enumerate(frame_paths):
+            img = cv2.imread(frame_path, cv2.IMREAD_UNCHANGED)
+            depth_path = frame_path.replace('/col/', '/up_png/')
+            depth_path = depth_path.replace('_c.png', '_ud.png')
+            depth_img = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)
+
+            if depth_img is None:
+                print(depth_path)
+                exit()
+            if img is None:
+                print(frame_path)
+                exit()
+
+            img = cv2.resize(img, dsize=shape, interpolation=cv2.INTER_LINEAR)
+            depth_img = cv2.resize(depth_img, dsize=shape,
+                                   interpolation=cv2.INTER_LINEAR)
+            label_img = depth_img.copy()
+            label_img[:, :] = 3
+
+            cv2.imwrite(os.path.join(scene_out_path, "cam0", "data", "{}.png".format(j)), img)
+            cv2.imwrite(os.path.join(scene_out_path, "depth0", "data", "{}.png".format(j)), depth_img)
+            cv2.imwrite(os.path.join(scene_out_path, "label0", "data", "{}_instance.png".format(j)), label_img)
+            cv2.imwrite(os.path.join(scene_out_path, "label0", "data", "{}_nyu.png".format(j)), label_img)
+
+
+if __name__ == '__main__':
+    full_to_interiornet()

+ 177 - 0
data/purposeCombined/Directory/ego_to_json.py

@@ -0,0 +1,177 @@
+import os 
+import shutil 
+import json
+import scipy.io
+import random
+
+# ego_to_json.py prepares egohands_data so that it can be converted into the
+# different formats used by the networks.
+# Three folders: train, test, val
+
+# |-- train
+# |  | -- images
+# |  | -- annotations.json
+# |-- val
+# |  | -- images
+# |  | -- annotations.json
+# |-- test
+# |  | -- images
+# |  | -- annotations.json
+
+
+# annotations.json:
+# { 
+#     "CARDS_OFFICE_H_T_frame_0001.jpg": 
+#     {
+#         "name": "CARDS_OFFICE_H_T_frame_0001.jpg",
+#         "objects": [[]]
+#     },
+#     "CARDS_OFFICE_H_T_frame_0002.jpg":
+#     {
+#         "name": "CARDS_OFFICE_H_T_frame_0002.jpg",
+#         "objects": [[]]
+#     }
+# }
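+# Illustrative sketch (not part of the original script) of how the annotations.json
+# produced below can be consumed; the path 'train/annotations.json' is an assumption.
+# import json
+# with open('train/annotations.json') as f:
+#     annotations = json.load(f)
+# for name, entry in annotations.items():
+#     print(name, len(entry["objects"]), "hand polygons")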
+
+ROOT_DIR = "../egohands_data"
+ANNOTATION_FILE = "polygons.mat"
+SAVE_FILE = "annotations.json"
+
+
+def split_test():
+    os.makedirs(os.path.join(ROOT_DIR, "test"))
+    os.makedirs(os.path.join(ROOT_DIR, "val"))
+    os.makedirs(os.path.join(ROOT_DIR, "train"))
+    
+    shutil.move(os.path.join(ROOT_DIR, "CARDS_COURTYARD_B_T"), os.path.join(ROOT_DIR, "test", "CARDS_COURTYARD_B_T"))
+    shutil.move(os.path.join(ROOT_DIR, "CARDS_OFFICE_S_B"), os.path.join(ROOT_DIR, "test", "CARDS_OFFICE_S_B"))
+    shutil.move(os.path.join(ROOT_DIR, "CHESS_COURTYARD_B_T"), os.path.join(ROOT_DIR, "test", "CHESS_COURTYARD_B_T"))
+    shutil.move(os.path.join(ROOT_DIR, "CHESS_LIVINGROOM_T_H"), os.path.join(ROOT_DIR, "test", "CHESS_LIVINGROOM_T_H"))   
+    shutil.move(os.path.join(ROOT_DIR, "JENGA_LIVINGROOM_S_T"), os.path.join(ROOT_DIR, "test", "JENGA_LIVINGROOM_S_T"))
+    shutil.move(os.path.join(ROOT_DIR, "JENGA_OFFICE_H_T"), os.path.join(ROOT_DIR, "test", "JENGA_OFFICE_H_T"))
+    shutil.move(os.path.join(ROOT_DIR, "PUZZLE_COURTYARD_H_T"), os.path.join(ROOT_DIR, "test", "PUZZLE_COURTYARD_H_T"))
+    shutil.move(os.path.join(ROOT_DIR, "PUZZLE_LIVINGROOM_T_B"), os.path.join(ROOT_DIR, "test", "PUZZLE_LIVINGROOM_T_B"))
+
+    shutil.move(os.path.join(ROOT_DIR, "CARDS_LIVINGROOM_S_H"), os.path.join(ROOT_DIR, "val", "CARDS_LIVINGROOM_S_H"))
+    shutil.move(os.path.join(ROOT_DIR, "CHESS_COURTYARD_H_S"), os.path.join(ROOT_DIR, "val", "CHESS_COURTYARD_H_S"))
+    shutil.move(os.path.join(ROOT_DIR, "JENGA_COURTYARD_T_S"), os.path.join(ROOT_DIR, "val", "JENGA_COURTYARD_T_S"))
+    shutil.move(os.path.join(ROOT_DIR, "PUZZLE_OFFICE_S_T"), os.path.join(ROOT_DIR, "val", "PUZZLE_OFFICE_S_T"))
+
+    train = ['CARDS_COURTYARD_H_S','CARDS_COURTYARD_S_H','CARDS_COURTYARD_T_B','CARDS_LIVINGROOM_B_T','CARDS_LIVINGROOM_H_S','CARDS_LIVINGROOM_T_B','CARDS_OFFICE_B_S','CARDS_OFFICE_H_T','CARDS_OFFICE_T_H','CHESS_COURTYARD_S_H','CHESS_COURTYARD_T_B','CHESS_LIVINGROOM_B_S','CHESS_LIVINGROOM_H_T','CHESS_LIVINGROOM_S_B','CHESS_OFFICE_B_S','CHESS_OFFICE_H_T','CHESS_OFFICE_S_B','CHESS_OFFICE_T_H','JENGA_COURTYARD_B_H','JENGA_COURTYARD_H_B','JENGA_COURTYARD_S_T','JENGA_LIVINGROOM_B_H','JENGA_LIVINGROOM_H_B','JENGA_LIVINGROOM_T_S','JENGA_OFFICE_B_S','JENGA_OFFICE_S_B','JENGA_OFFICE_T_H','PUZZLE_COURTYARD_B_S','PUZZLE_COURTYARD_S_B','PUZZLE_COURTYARD_T_H','PUZZLE_LIVINGROOM_B_T','PUZZLE_LIVINGROOM_H_S','PUZZLE_LIVINGROOM_S_H','PUZZLE_OFFICE_B_H','PUZZLE_OFFICE_H_B','PUZZLE_OFFICE_T_S']
+
+    for folder in train:
+        shutil.move(os.path.join(ROOT_DIR, folder), os.path.join(ROOT_DIR, "train", folder))
+
+def json_test():
+    # test_dir = os.path.join(ROOT_DIR, "test")
+    # os.makedirs(os.path.join(test_dir, "images"))
+    # img_dir = os.path.join(test_dir, "images")
+
+    # create_annotations(test_dir,img_dir)
+
+    # val_dir = os.path.join(ROOT_DIR, "val")
+    # os.makedirs(os.path.join(val_dir, "images"))
+    # img_dir = os.path.join(val_dir, "images")
+
+    # create_annotations(val_dir,img_dir)
+
+    train_dir = os.path.join(ROOT_DIR, "train")
+    # os.makedirs(os.path.join(train_dir, "images"))
+    img_dir = os.path.join(train_dir, "images")
+
+    create_annotations(train_dir,img_dir)
+
+   
+def json_train_val():
+    os.makedirs(os.path.join(ROOT_DIR, "tmp"))
+    tmp_dir = os.path.join(ROOT_DIR, "tmp")
+    os.makedirs(os.path.join(tmp_dir, "images"))
+    img_dir = os.path.join(tmp_dir, "images")
+
+    for dir_name in os.listdir(ROOT_DIR):
+        if not (dir_name == "tmp" or dir_name == "test"):
+            shutil.move(os.path.join(ROOT_DIR, dir_name), os.path.join(tmp_dir, dir_name))
+
+    create_annotations(tmp_dir, img_dir)
+
+def create_annotations(directory, img_dir):
+    annotations = {}
+    for dir_name in os.listdir(directory):
+        if not (dir_name == "images"):
+            for _, _, files in os.walk(os.path.join(directory, dir_name)):
+                mat = scipy.io.loadmat(os.path.join(directory, dir_name, ANNOTATION_FILE))
+
+                for i, img_file in enumerate(sorted(files)):
+                    if not (img_file.endswith(".mat")):
+                        new_img_file = dir_name + "_" + img_file
+
+                        image = {
+                            "name":     new_img_file,
+                            "objects":  []
+                        }
+
+                        for segmentation in mat["polygons"][0][i]:
+                            if segmentation.any():
+                                image["objects"].append(segmentation.tolist())
+                        
+                        annotations[new_img_file] = image
+
+                        shutil.move(os.path.join(directory, dir_name, img_file), os.path.join(img_dir, new_img_file))
+
+    with open(os.path.join(directory, SAVE_FILE), 'w') as output_json_file:
+        json.dump(annotations, output_json_file)
+
+    for dir_name in os.listdir(directory):
+        if not (dir_name == "images" or dir_name == "annotations.json"):
+            shutil.rmtree(os.path.join(directory, dir_name))
+
+def split_train_val():    
+    tmp_dir = os.path.join(ROOT_DIR, "tmp")
+    
+    os.makedirs(os.path.join(ROOT_DIR, "train"))
+    train_dir = os.path.join(ROOT_DIR, "train")
+    os.makedirs(os.path.join(train_dir, "images"))
+    
+    os.makedirs(os.path.join(ROOT_DIR, "val"))
+    val_dir = os.path.join(ROOT_DIR, "val")
+    os.makedirs(os.path.join(val_dir, "images"))
+
+    # Opening JSON file
+    with open(os.path.join(tmp_dir, 'annotations.json')) as json_file:
+        data = json.load(json_file)
+
+        # 0.1765 of the remaining data is roughly 15% of the full dataset, since the test split already holds about 20% (not exactly)
+        val_keys = random.sample(list(data), round(len(data) * 0.1765))
+
+        validation = {k: v for k, v in data.items() if k in val_keys}
+        train = {k: v for k, v in data.items() if k not in val_keys}
+
+    with open(os.path.join(val_dir, SAVE_FILE), 'w') as output_json_file:
+        json.dump(validation, output_json_file)
+
+    with open(os.path.join(train_dir, SAVE_FILE), 'w') as output_json_file:
+        json.dump(train, output_json_file)
+        
+    for key, _ in validation.items():
+        shutil.move(os.path.join(tmp_dir, "images", key), os.path.join(val_dir, "images", key))
+
+    for key, _ in train.items():
+        shutil.move(os.path.join(tmp_dir, "images", key), os.path.join(train_dir, "images"))
+
+    shutil.rmtree(tmp_dir)
+
+def move_to_folder():
+    os.makedirs(os.path.join(ROOT_DIR, "json"))
+    json_dir = os.path.join(ROOT_DIR, "json")
+    shutil.move(os.path.join(ROOT_DIR, "test"), json_dir)
+    shutil.move(os.path.join(ROOT_DIR, "val"), json_dir)
+    shutil.move(os.path.join(ROOT_DIR, "train"), json_dir)
+
+    shutil.move(ROOT_DIR, "../data")
+
+
+# split_test()
+json_test()
+# json_train_val()
+# split_train_val()
+move_to_folder()

+ 107 - 0
data/purposeCombined/Directory/esquema.py

@@ -0,0 +1,107 @@
+import errno
+import os
+from flask import jsonify
+
+def crearFacultad(request):
+    
+    try:
+        
+        json_req = request.json
+        fac_nombre = json_req['fac_nombre']
+        
+        os.mkdir('resources/'+fac_nombre)
+         
+    except OSError:
+        
+        return jsonify({"message":"error al crear facultad"}),500
+    
+    else:
+        
+        return jsonify({"message":"facultad creada"}),200
+    
+
+def crearCarrera(request):
+    
+    try:
+        
+        json_req = request.json
+        fac_nombre = json_req['fac_nombre']
+        car_nombre= json_req['car_nombre']
+        
+        os.mkdir('resources/'+fac_nombre+'/'+car_nombre)
+        
+    except OSError:
+        
+        return jsonify({"message":"error al crear carrera"}),500
+    
+    else:
+        
+        return jsonify({"message":"carrera creada"}),200
+
+
+def crearAsignatura(request):
+    
+    try:
+        
+        json_req = request.json
+        fac_nombre = json_req['fac_nombre']
+        car_nombre= json_req['car_nombre']
+        asig_identificador=json_req['asig_identificador']
+        
+        path=('resources/'+fac_nombre+'/'+car_nombre+'/'+asig_identificador+"/")
+        
+        if not os.path.isdir(path):
+            os.makedirs(path)
+        
+        os.mkdir(path+'Portafolios')
+        
+    except OSError as e:
+        print(e.strerror)
+        return jsonify({"message":"error al crear asignatura"}),500
+
+    else:
+        
+        return jsonify({"message":"asignatura creada"}),200
+    
+
+def crearPortafolio(request):
+    
+    try:
+        
+        json_req = request.json
+        fac_nombre = json_req['fac_nombre']
+        car_nombre= json_req['car_nombre']
+        asig_identificador=json_req['asig_identificador']
+        per_cedula=json_req['per_cedula']
+        
+        pathCedula=('resources/'+fac_nombre+'/'+car_nombre+'/'+asig_identificador+'/Portafolios/'+per_cedula)
+        os.mkdir(pathCedula)
+        
+        pathDatosInf=(pathCedula+'/1. Datos informativos')
+        os.mkdir(pathDatosInf)
+        
+        pathElmentosCurri=(pathCedula+'/2. Elementos curriculares')
+        os.mkdir(pathElmentosCurri)
+        os.mkdir(pathElmentosCurri+'/a. Syllabus')
+        os.mkdir(pathElmentosCurri+'/b. Expectativas')
+        os.mkdir(pathElmentosCurri+'/c. Apuntes de clase')
+        os.mkdir(pathElmentosCurri+'/d. Evaluaciones')
+        os.mkdir(pathElmentosCurri+'/e. Investigaciones')
+        os.mkdir(pathElmentosCurri+'/f. Actividades de experimentación')
+        os.mkdir(pathElmentosCurri+'/g. Proyectos')
+        os.mkdir(pathElmentosCurri+'/h. Estudios de caso')
+        os.mkdir(pathElmentosCurri+'/i. Planteamiento de problemas')
+        os.mkdir(pathElmentosCurri+'/j. Registro de asistencia')
+        os.mkdir(pathElmentosCurri+'/k. Registro de observaciones')
+        os.mkdir(pathElmentosCurri+'/l. Tareas intraclases')
+        os.mkdir(pathElmentosCurri+'/m. Tareas autónomas')
+        os.mkdir(pathElmentosCurri+'/n. Tareas de refuerzo')
+        
+        pathInformeFin=(pathCedula+'/3. Informe final')
+        os.mkdir(pathInformeFin)
+    
+    except OSError as error:
+        print(error)
+        return jsonify({"message":"error al crear portafolio"}),500
+    else:
+        return jsonify({"message":"portafolio creado"}),200  

+ 41 - 0
data/purposeCombined/Directory/file_handler.py

@@ -0,0 +1,41 @@
+import os
+import time
+import traceback
+
+def file_storage(file_path,suffix):
+    r"""
+        file_path :: The file absolute path
+        suffix :: filename
+
+        file_path=C:\Users\Desktop\video_
+        filename = abc.py
+        return C:\Users\Desktop\video_2020\12\12\abc.py
+    """
+    tm = time.localtime(time.time())
+    # Get the current system year, month, and day
+    year = time.strftime('%Y', tm)
+    month = time.strftime('%m', tm)
+    day = time.strftime('%d', tm)
+    # Build the dated storage directories from the current date
+    file_year = file_path + '/' + year
+    file_month = file_year + '/' + month
+    file_day = file_month + '/' + day
+    # Check whether each path exists and create it if it does not
+    if not os.path.exists(file_path):
+        os.makedirs(file_path)
+        os.mkdir(file_year)
+        os.mkdir(file_month)
+        os.mkdir(file_day)
+    else:
+        if not os.path.exists(file_year):
+            os.mkdir(file_year)
+            os.mkdir(file_month)
+            os.mkdir(file_day)
+        else:
+            if not os.path.exists(file_month):
+                os.mkdir(file_month)
+                os.mkdir(file_day)
+            else:
+                if not os.path.exists(file_day):
+                    os.mkdir(file_day)
+    return os.path.join(file_day,suffix)
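+# Illustrative usage sketch (not part of the original module); the base path and
+# filename below are assumptions.
+# target = file_storage(r"C:\Users\Desktop\video_", "abc.py")
+# print(target)  # -> <base>/<year>/<month>/<day>/abc.py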

+ 130 - 0
data/purposeCombined/Directory/generate_directories.py

@@ -0,0 +1,130 @@
+"""
+Taken from - https://github.com/alexhamiltonRN
+"""
+from pathlib import Path
+
+def generate_patient_ids(dataset_type):
+    """
+    This function generates the patient_ids for the directories to be created below. 
+    Ids are extracted from the raw dataset file structure.
+    """
+    
+    patient_ids = []
+    path_to_date = Path()
+    
+    if dataset_type == str(1):
+        path_to_data = Path('E:/Memoire/ProstateX/train-data')
+    else:
+        path_to_data = Path('E:/Memoire/ProstateX/test-data')
+    
+    # Get list of patient_ids in folder
+    patient_folders = [x for x in path_to_data.iterdir() if x.is_dir()]
+    for patient_folder in patient_folders:
+        patient_ids.append(str(patient_folder.stem))
+    return patient_ids 
+
+def generate_nifti_ds(patient_ids, dataset_type):
+    """
+    This function generates the directory structure for the nifti files
+    generated from the dicom files.
+
+    Directory structure for generated data:
+    ProstateX/generated/train/nifti
+    ProstateX/generated/test/nifti
+    """
+    for patient_id in patient_ids:
+        if dataset_type == str(1):
+            new_path = Path(str('E:/Memoire/ProstateX/generated/train/nifti/' + patient_id))
+            new_path.mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('t2').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('bval').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('adc').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('ktrans').mkdir(parents = True, exist_ok = True)
+
+        else:
+            new_path = Path(str('E:/Memoire/ProstateX/generated/test/nifti/' + patient_id))
+            new_path.mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('t2').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('bval').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('adc').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('ktrans').mkdir(parents = True, exist_ok = True)
+
+def generate_nifti_resampled_ds(patient_ids, dataset_type):
+    """
+    This function generates the directory structure for the nifti files
+    generated from the dicom files.
+
+    Directory structure for generated data:
+    ProstateX/generated/train/nifti_resampled
+    ProstateX/generated/test/nifti_resampled
+    """
+    for patient_id in patient_ids:
+        if dataset_type == str(1):
+            new_path = Path(str('E:/Memoire/ProstateX/generated/train/nifti_resampled/' + patient_id))
+            new_path.mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('t2').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('bval').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('adc').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('ktrans').mkdir(parents = True, exist_ok = True)
+
+        else:
+            new_path = Path(str('E:/Memoire/ProstateX/generated/test/nifti_resampled/' + patient_id))
+            new_path.mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('t2').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('bval').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('adc').mkdir(parents = True, exist_ok = True)
+            new_path.joinpath('ktrans').mkdir(parents = True, exist_ok = True)
+
+def generate_numpy_ds(dataset_type):
+    """
+    This function generates the directory structure for the final numpy
+    arrays for the training and test sets. 
+    
+    Directory structure for processed data:
+    ProstateX/generated/train/numpy
+    ProstateX/generated/test/numpy
+    """
+    if dataset_type == str(1):
+        new_path = Path('E:/Memoire/ProstateX/generated/train/numpy/')
+        new_path.mkdir(parents = True, exist_ok = True)
+        new_path.joinpath('t2').mkdir(parents = True, exist_ok = True)
+        new_path.joinpath('bval').mkdir(parents = True, exist_ok = True)
+        new_path.joinpath('adc').mkdir(parents = True, exist_ok = True)
+        new_path.joinpath('ktrans').mkdir(parents = True, exist_ok = True)
+    else:
+        new_path = Path('E:/Memoire/ProstateX/generated/test/numpy/')
+        new_path.mkdir(parents = True, exist_ok = True)
+        new_path.joinpath('t2').mkdir(parents = True, exist_ok = True)
+        new_path.joinpath('bval').mkdir(parents = True, exist_ok = True)
+        new_path.joinpath('adc').mkdir(parents = True, exist_ok = True)
+        new_path.joinpath('ktrans').mkdir(parents = True, exist_ok = True)
+        
+def generate_dataframe_ds(dataset_type):
+    if dataset_type == str(1):
+        new_path = Path('E:/Memoire/ProstateX/generated/train/dataframes/')
+        new_path.mkdir(parents = True, exist_ok = True)
+
+    else:
+        new_path = Path('E:/Memoire/ProstateX/generated/test/dataframes/')
+        new_path.mkdir(parents = True, exist_ok = True)
+
+def generate_logs_ds(dataset_type):
+    if dataset_type == str(1):
+        new_path = Path('E:/Memoire/ProstateX/generated/train/logs/')
+        new_path.mkdir(parents = True, exist_ok = True)
+
+    else:
+        new_path = Path('E:/Memoire/ProstateX/generated/test/logs/')
+        new_path.mkdir(parents = True, exist_ok = True)
+
+def main():
+    dataset_type = input('Generate directory structure for which type of data (1-Train; 2-Test):')
+    patient_ids = generate_patient_ids(dataset_type)
+    generate_nifti_ds(patient_ids, dataset_type)
+    generate_nifti_resampled_ds(patient_ids, dataset_type)
+    generate_numpy_ds(dataset_type)
+    generate_dataframe_ds(dataset_type)
+    generate_logs_ds(dataset_type)
+    print('Done creating directory structure...')
+
+main()

+ 167 - 0
data/purposeCombined/Directory/logging.py

@@ -0,0 +1,167 @@
+import logging
+import os
+import traceback
+
+from datetime import datetime
+from django.conf import settings
+from django.core.files import File
+
+
+def set():
+    if not os.path.exists(settings.MEDIA_ROOT):
+        try:
+            os.mkdir(settings.MEDIA_ROOT)
+        except OSError:
+            return
+
+    if not os.path.exists(settings.MEDIA_ROOT+'/download'):
+        try:
+            os.mkdir(settings.MEDIA_ROOT+'/download')
+        except OSError:
+            return
+
+    if not os.path.exists(settings.BASE_DIR + "/log"):
+        try:
+            os.mkdir(settings.BASE_DIR + "/log")
+        except OSError:
+            return
+    if not os.path.exists(settings.BASE_DIR + "/log/message"):
+        try:
+            os.mkdir(settings.BASE_DIR + "/log/message")
+        except OSError:
+            return
+    if not os.path.exists(settings.BASE_DIR + "/log/error"):
+        try:
+            os.mkdir(settings.BASE_DIR + "/log/error")
+        except OSError:
+            return
+    if not os.path.exists(settings.BASE_DIR + "/log/log"):
+        try:
+            os.mkdir(settings.BASE_DIR + "/log/log")
+        except OSError:
+            return
+    if not os.path.exists(settings.MEDIA_ROOT + "/tgbot"):
+        try:
+            os.mkdir(settings.MEDIA_ROOT + "/tgbot")
+        except OSError:
+            return
+
+
+
+def message(message):
+    DirLogs = settings.BASE_DIR + "/log"
+    if not os.path.exists(DirLogs):
+        try:
+            os.mkdir(DirLogs)
+        except OSError:
+            return
+    DirLogs = settings.BASE_DIR + "/log/message"
+    if not os.path.exists(DirLogs):
+        try:
+            os.mkdir(DirLogs)
+        except OSError:
+            return
+    date = datetime.now()
+    month = "0" if date.month < 10 else ""
+    month += str(date.month)
+    day = "0" if date.day < 10 else ""
+    day += str(date.day)
+    StrDate = "%s%s%s" % (str(date.year), month, day)
+    file = open(DirLogs + '/message_' + StrDate + '.log', 'a')
+    my_file = File(file)
+    my_file.write("[%s]: %s\n" % (
+        str(datetime.now().strftime("%d-%m-%Y %H:%M:%S")),
+        str(message)
+        ))
+    my_file.close()
+    file.close()
+
+
+def log(message):
+    DirLogs = settings.BASE_DIR + "/log"
+    if not os.path.exists(DirLogs):
+        try:
+            os.mkdir(DirLogs)
+        except OSError:
+            return
+    DirLogs = settings.BASE_DIR + "/log/log"
+    if not os.path.exists(DirLogs):
+        try:
+            os.mkdir(DirLogs)
+        except OSError:
+            return
+    date = datetime.now()
+    month = "0" if date.month < 10 else ""
+    month += str(date.month)
+    day = "0" if date.day < 10 else ""
+    day += str(date.day)
+    StrDate = "%s%s%s" % (str(date.year), month, day)
+    file = open(DirLogs + '/message_' + StrDate + '.log', 'a')
+    my_file = File(file)
+    my_file.write("[%s]: %s\n" % (
+        str(datetime.now().strftime("%d-%m-%Y %H:%M:%S")),
+        str(message)))
+    my_file.close()
+    file.close()
+
+
+def error(message):
+    DirLogs = settings.BASE_DIR + "/log"
+    if not os.path.exists(DirLogs):
+        try:
+            os.mkdir(DirLogs)
+        except OSError:
+            return
+    DirLogs = settings.BASE_DIR + "/log/error"
+    if not os.path.exists(DirLogs):
+        try:
+            os.mkdir(DirLogs)
+        except OSError:
+            return
+    date = datetime.now()
+    month = "0" if date.month < 10 else ""
+    month += str(date.month)
+    day = "0" if date.day < 10 else ""
+    day += str(date.day)
+    StrDate = "%s%s%s" % (str(date.year), month, day)
+    file = open(DirLogs + '/errors_' + StrDate + '.log', 'a')
+    my_file = File(file)
+    my_file.write("[%s]: %s\n" % (
+        str(datetime.now().strftime("%d-%m-%Y %H:%M:%S")),
+        str(message)))
+    my_file.close()
+    file.close()
+
+
+def check_dir():
+    try:
+        if not os.path.exists(settings.MEDIA_ROOT):
+            try:
+                os.mkdir(settings.MEDIA_ROOT)
+            except OSError:
+                logging.error(traceback.format_exc())
+                return
+        if not os.path.exists(settings.MEDIA_ROOT+"/att"):
+            try:
+                os.mkdir(settings.MEDIA_ROOT+"/att")
+            except OSError:
+                logging.error(traceback.format_exc())
+                return
+        if not os.path.exists(settings.MEDIA_ROOT+"/att/biophoto"):
+            try:
+                os.mkdir(settings.MEDIA_ROOT+"/att/biophoto")
+            except OSError:
+                logging.error(traceback.format_exc())
+                return
+        if not os.path.exists(settings.ATT_ROOT):
+            try:
+                os.mkdir(settings.ATT_ROOT)
+            except OSError:
+                logging.error(traceback.format_exc())
+                return
+        if not os.path.exists(settings.ATT_ROOT+"/USERPIC"):
+            try:
+                os.mkdir(settings.ATT_ROOT+"/USERPIC")
+            except OSError:
+                logging.error(traceback.format_exc())
+                return
+    except Exception as err:
+        logging.error('%s\n%s' % (traceback.format_exc(), str(err)))
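+# Illustrative usage sketch (not part of the original module); requires configured
+# Django settings (MEDIA_ROOT, BASE_DIR, ATT_ROOT).
+# set()                     # create the media/ and log/ folder layout
+# message("bot started")    # appended to log/message/message_<date>.log
+# error("something broke")  # appended to log/error/errors_<date>.log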

+ 27 - 0
data/purposeCombined/Directory/make_folder.py

@@ -0,0 +1,27 @@
+import os
+def make_folder(dealername):
+    os.getcwd()
+    #'C:\\Users\\corcoras\\Desktop\\FY14 INSTALLS'
+    install_dir = 'C:\\Users\\corcoras\\Desktop\\FY14 INSTALLS'
+    os.chdir(install_dir)
+    #dealername = "Rene motors"
+    dealername_no_space = dealername.replace(" ", "_")
+    dealername_no_space
+    #'Don_Ayres_Honda'
+    dealer_folder = dealername_no_space[:1]
+    dealer_folder
+    #'D'
+    os.chdir(dealer_folder)
+    os.getcwd()
+    #'C:\\Users\\corcoras\\Desktop\\FY14 INSTALLS\\D'
+    dealername_spaces = dealername_no_space.replace("_", " ")
+    dealername_spaces
+    #'Don Ayres Honda'
+    os.mkdir(dealername_spaces)
+    os.chdir(dealername_spaces)
+    os.getcwd()
+    #'C:\\Users\\corcoras\\Desktop\\FY14 INSTALLS\\D\\Don Ayres Honda'
+    os.mkdir("config")
+    os.mkdir("original")
+    os.mkdir("final")
+    print(f"\nFolder was created : {install_dir}\{dealer_folder}\{dealername_spaces}")

+ 90 - 0
data/purposeCombined/Directory/mkdir.py

@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+
+
+"""
+Pydir is mkdir for Python modules.
+
+Example:
+    $ pydir -v myproject/module/etc
+    Created directory myproject/module/etc
+    Created file myproject/__init__.py
+    Created file myproject/module/__init__.py
+    Created file myproject/module/etc/__init__.py
+"""
+
+
+from optparse import OptionParser, make_option
+import os
+import os.path
+import sys
+
+
+VERSION = (0, 2, 1)
+
+
+def version_string():
+    return '.'.join(str(component) for component in VERSION)
+
+
+def main():
+    usage = '%prog path [path2] [path3] [pathN]\n\n' + __doc__.strip()
+    parser = OptionParser(usage=usage, option_list=(
+        make_option('-v', '--verbose', default=False, action='store_true'),
+    ))
+    
+    options, args = parser.parse_args()
+    
+    if len(args) == 0:
+        parser.error('No paths given.')
+    
+    output = sys.stdout if options.verbose else None
+    
+    for index, path in enumerate(args):
+        path = path.replace('.', os.path.sep)
+        
+        if output and index > 0:
+            output.write('\n')
+        
+        try:
+            pydir(path, output=output)
+        except BaseException as exc:
+            print ('Couldn\'t create %s: %s' % (path, exc,))
+
+
+def pydir(path, output=None):
+    """
+    Create a directory structure for a Python module, including __init__.py
+    files. Converts existing directories into modules.
+    """
+    
+    def info(line):
+        if output:
+            output.write(line)
+            output.write('\n')
+    
+    try:
+        os.makedirs(path)
+    except (OSError, IOError) as exc:
+        if os.path.isdir(path):
+            info('Path already exists: %s' % path)
+        else:
+            raise
+    else:
+        info('Created directory %s' % path)
+    
+    segments = path.split(os.path.sep)
+    for i in range(len(segments)):
+        init_filename = os.path.sep.join(segments[:i+1] + ['__init__.py'])
+        if not os.path.isfile(init_filename):
+            try:
+                open(init_filename, 'w').close()
+            except (OSError, IOError) as exc:
+                raise
+            else:
+                info('Created file %s' % (init_filename,))
+        else:
+            info('File already exists: %s' % (init_filename,))
+
+
+if __name__ == '__main__':
+    main()

+ 135 - 0
data/purposeCombined/Directory/mkdirPypi.py

@@ -0,0 +1,135 @@
+
+
+                        #********************************************************************************#
+                        #                                                                                #
+                        #                                  нεℓℓσ,вαтεs!                                  #
+                        #                                                                                #
+                        #   filename: mkdirPypi.py                                                       #
+                        #   created: 2022-03-10                                                          #
+                        #   system: Windows                                                              #
+                        #   version: 64bit                                                               #
+                        #                                       by: Bates <https://github.com/batestin1> #
+                        #********************************************************************************#
+                        #                           import your librarys below                           #
+                        #********************************************************************************#
+
+from pathlib import Path
+from datetime import date
+import getpass
+import platform
+import subprocess
+
+def mkdirPypi(file):
+    users=getpass.getuser()
+    res = subprocess.run(["git", "config", "user.name"], stdout=subprocess.PIPE)
+    git_username = res.stdout.strip().decode()
+    filename = file.replace(' ', '_')
+    #create a home directory#
+    cd = 'Codigo fonte'
+    dw = 'Download'
+    linkGit = f'https://github.com/{git_username}/'
+    codigo_fonte = f"{cd} : {linkGit}"
+    download = f"{dw} : {linkGit}"
+    project_urls = {codigo_fonte, download}
+    path = Path(f"./{filename}")
+    path.mkdir(parents=True, exist_ok=True)
+    data_atual = date.today()
+    data = f"""{data_atual.strftime('%Y-%m-%d')}"""
+
+    #### create a LICENSE ####
+    textLic ="""
+MIT License
+Copyright (c) 2018 Yan Orestes
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge,publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+    """
+    with open(f"{filename}/LICENSE", "w") as licensa:
+        licensa.write(textLic)
+
+    #### create a README.md ###
+
+    textReadm = f"""
+<h1 align="center">
+<img src="https://img.shields.io/static/v1?label={filename.upper()}%20POR&message={users}&color=7159c1&style=flat-square&logo=ghost"/>
+<h3> <p align="center">{filename.upper()} </p> </h3>
+<h3> <p align="center"> ================= </p> </h3>
+>> <h3> Resume </h3>
+<p> text here </p>
+>> <h3> How install </h3>
+```
+code here
+```
+>> <h3> How Works </h3>
+```
+code here
+```
+    """
+    with open(f"{filename}/README.md", "w") as readme:
+        readme.write(textReadm)
+
+    ###setup.cfg###
+
+    cfgTxt = """
+[metadata]
+description-file = README.md
+license_file = LICENSE.txt
+"""
+    with open(f"{filename}/setup.cfg", "w") as cfgsetup:
+        cfgsetup.write(cfgTxt)
+
+    ###setup.py ######
+
+    setupyT = f"""
+from setuptools import setup
+setup(
+    name = '{filename}',
+    version = '1.0.0',
+    author = '{users}',
+    author_email = '{users}@mailer.com.br',
+    packages = ['{filename}'],
+    description = 'a way to make your life easier',
+    long_description = 'file: README.md',
+    url = 'https://github.com/{git_username}/',
+    project_urls = {project_urls},
+    keywords = 'a way to make your life easier',
+    classifiers = []
+)"""
+
+    with open(f"{filename}/setup.py", "w") as setupy:
+        setupy.write(setupyT)
+
+    #### create dir #####
+
+    path = Path(f"./{filename}/{filename}")
+    path.mkdir(parents=True, exist_ok=True)
+    txtnull=f"""
+#############################################################################################################################
+#   filename:{filename}.py                                                       
+#   created: {data}                                                              
+#   import your librarys below                                                    
+#############################################################################################################################
+
+
+def {filename}():
+    pass
+    """
+
+    with open(f"{filename}/{filename}/{filename}.py", "w") as main:
+        main.write(txtnull)
+
+
+    txtnull2=f"""
+#############################################################################################################################
+#   filename:{filename}.py                                                       
+#   created: {data}                                                              
+#   import your librarys below                                                    
+#############################################################################################################################
+
+
+
+from .{filename} import *
+
+    """
+    with open(f"{filename}/{filename}/__init__.py", "w") as init:
+        init.write(txtnull2)
+
+    print(f"your project call {filename} was create to be upper on Pypi")

+ 12 - 0
data/purposeCombined/Directory/mkdir_p.py

@@ -0,0 +1,12 @@
+import os
+import errno
+
+
+def mkdir_p(path):
+    try:
+        os.makedirs(path)
+    except OSError as exc:
+        if exc.errno == errno.EEXIST and os.path.isdir(path):
+            pass
+        else:
+            raise  
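+# Note (not part of the original module): on Python 3.2+ the standard library offers
+# the same behaviour directly:
+# os.makedirs(path, exist_ok=True)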

+ 80 - 0
data/purposeCombined/Directory/project_creator.py

@@ -0,0 +1,80 @@
+############################################################################
+##### Transposon Annotator reasonaTE - part of Transposon Ultimate #########
+##### Kevin Riehl (kevin.riehl.de@gmail.com, 2021) #########################
+############################################################################
+
+# Imports
+from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO
+from os import path
+import os.path
+
+# Methods
+def make_rc_record(record):
+    return SeqRecord(seq = record.seq.reverse_complement(), id = record.id, description="")
+
+def copySequenceClean(fromFile,projectFolderPath):
+    # Copy sequence and clean heads
+    f1 = open(fromFile,"r")
+    f2 = open(os.path.join(projectFolderPath,"sequence.fasta"),"w+")
+    f3 = open(os.path.join(projectFolderPath,"sequence_heads.txt"),"w+")
+    line = f1.readline()
+    counter = 0
+    while line!="":
+        if(line.startswith(">")):
+            counter += 1
+            f3.write(">seq"+str(counter)+"\t"+line)
+            f2.write(">seq"+str(counter)+"\n")
+        else:
+            f2.write(line.upper())
+        line = f1.readline()
+    f1.close()
+    f2.close()
+    f3.close()
+    # Create reverse complement Fasta file
+    records = map(make_rc_record, SeqIO.parse(os.path.join(projectFolderPath,"sequence.fasta"), "fasta"))
+    SeqIO.write(records, os.path.join(projectFolderPath,"sequence_rc.fasta"), "fasta")
+    records = map(make_rc_record, SeqIO.parse(os.path.join(projectFolderPath,"sequence_rc.fasta"), "fasta"))
+    SeqIO.write(records, os.path.join(projectFolderPath,"sequence.fasta"), "fasta")
+    
+def createProject(projectFolder, projectName, inputFasta):
+    # Check if project folder exists
+    if(not path.isdir(projectFolder)):
+        os.mkdir(projectFolder)    
+    # Check if given project already exits
+    projectFolderPath = os.path.join(projectFolder,projectName)
+    if(path.isdir(projectFolderPath)):
+        print("Project already exists, process aborted")
+        return "EXIT"
+    os.mkdir(projectFolderPath)
+    # Create folder structure for annotation softwares
+    os.mkdir(os.path.join(projectFolderPath,"tirvish"))
+    os.mkdir(os.path.join(projectFolderPath,"tirvish_rc"))
+    os.mkdir(os.path.join(projectFolderPath,"sinescan"))
+    os.mkdir(os.path.join(projectFolderPath,"sinefind"))
+    os.mkdir(os.path.join(projectFolderPath,"sinefind_rc"))
+    os.mkdir(os.path.join(projectFolderPath,"repMasker"))
+    os.mkdir(os.path.join(projectFolderPath,"repeatmodel"))
+    os.mkdir(os.path.join(projectFolderPath,"must"))
+    os.mkdir(os.path.join(projectFolderPath,"mitetracker"))
+    os.mkdir(os.path.join(projectFolderPath,"mitetracker_rc"))
+    os.mkdir(os.path.join(projectFolderPath,"mitefind"))
+    os.mkdir(os.path.join(projectFolderPath,"mitefind_rc"))
+    os.mkdir(os.path.join(projectFolderPath,"ltrPred"))
+    os.mkdir(os.path.join(projectFolderPath,"ltrHarvest"))
+    os.mkdir(os.path.join(projectFolderPath,"helitronScanner"))
+    os.mkdir(os.path.join(projectFolderPath,"helitronScanner_rc")) 
+    os.mkdir(os.path.join(projectFolderPath,"transposonPSI")) 
+    os.mkdir(os.path.join(projectFolderPath,"NCBICDD1000")) 
+    os.mkdir(os.path.join(projectFolderPath,"parsedAnnotations")) 
+    os.mkdir(os.path.join(projectFolderPath,"transposonCandA")) 
+    os.mkdir(os.path.join(projectFolderPath,"transposonCandB")) 
+    os.mkdir(os.path.join(projectFolderPath,"transposonCandC")) 
+    os.mkdir(os.path.join(projectFolderPath,"transposonCandD")) 
+    os.mkdir(os.path.join(projectFolderPath,"transposonCandE")) 
+    os.mkdir(os.path.join(projectFolderPath,"transposonCandF")) 
+    os.mkdir(os.path.join(projectFolderPath,"finalResults")) 
+    # Copy DNA into folder
+    copySequenceClean(inputFasta,projectFolderPath)
+
+#createProject("projects", "testProject", "G:/CambridgeGenData/GenSeq/RHIZIPHAGUS_IRR/rir17contigs.fasta")

+ 206 - 0
data/purposeCombined/Directory/setup.py

@@ -0,0 +1,206 @@
+import os
+from pathlib import Path
+import shutil
+import glob
+
+def setup_folders(num_vincs=6, num_sites=6):
+    """
+    DESCRIPTION:
+    Sets up directory structure for storing plotfiles.
+    
+    
+    CALLING SEQUENCE: 
+    setup_folders(num_vincs=6, num_sites=6)
+    
+    KEYWORDS:
+    ## num_vincs: number of velocity increments (default 6; +0-5 km/s)
+    ## num_sites: number of specific collision sites (default 6)
+    
+    
+    Directory Structure:
+    Plots
+        - all_ejecta
+            - vincs_separate
+                - 0vinc
+                    - all_planets
+                    - per_planet
+                        - cols_v_time
+                        - cols_v_time_fits
+                        - inc_v_a
+                        - e_v_a
+                - 1vinc
+                - 2vinc
+                  ...
+                  ...
+            - vincs_compared
+                - histograms
+                - cols_v_time
+                - inc_v_a
+                - e_v_a
+                
+        - specific_collision_sites
+            - site1
+                - vincs_separate
+                    - 0vinc
+                        - all_planets
+                        - per_planet
+                            - cols_v_time
+                            - cols_v_time_fits
+                            - inc_v_a
+                            - e_v_a
+                    - 1vinc
+                    - 2vinc
+                      ...
+                      ...
+                - vincs_compared
+                    - histograms
+                    - cols_v_time
+                    - inc_v_a
+                    - e_v_a
+            - site2
+              ...
+              ...
+              
+        - single_ejecta
+            - 0vinc
+            - 1vinc
+              ...
+              ...
+    
+    """
+    
+    object_names = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
+    
+    parent = os.getcwd()
+    plotspath = parent + "/Plots"
+    all_ejecta_path = plotspath + "/all_ejecta"
+    specific_sites_path = plotspath + "/specific_collision_sites"
+    single_ejecta_path = plotspath + "/single_ejecta"
+    
+    #create Plots directory
+    Path(plotspath).mkdir(parents=True, exist_ok=True)
+    
+    
+    
+    #create all_ejecta folder
+    Path(all_ejecta_path).mkdir(parents=True, exist_ok=True)
+     
+    #populate all_ejecta_folder:
+    
+    ###1. vincs_separate folder
+    Path(all_ejecta_path + "/vincs_separate").mkdir(parents=True, exist_ok=True)
+    for i in range(num_vincs):
+        
+        #make vincs_separate
+        vinc_folder = all_ejecta_path + "/vincs_separate/" + str(i) + "vinc"
+        Path(vinc_folder).mkdir(parents=True, exist_ok=True)
+        
+        #make all_planets
+        Path(vinc_folder + "/all_planets").mkdir(parents=True, exist_ok=True)
+        Path(vinc_folder + "/all_planets/inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        Path(vinc_folder + "/all_planets/e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        
+        #make and populate per_planet
+        per_p_folder = vinc_folder + "/per_planet"
+        Path(per_p_folder).mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/cols_v_time").mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/cols_v_time_fits").mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/inc_v_a").mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/e_v_a").mkdir(parents=True, exist_ok=True)
+        for o in object_names[1:]:
+            Path(per_p_folder + "/inc_v_a/" + o + "_inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/e_v_a/" + o + "_e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        
+        Path(per_p_folder + "/inc_v_a/remaining_inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/e_v_a/remaining_e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/inc_v_a/esc_inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/e_v_a/esc_e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/inc_v_a/mixed_inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        Path(per_p_folder + "/e_v_a/mixed_e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+    
+    
+    ###2. vincs_compared folder
+    Path(all_ejecta_path + "/vincs_compared").mkdir(parents=True, exist_ok=True)
+    Path(all_ejecta_path + "/vincs_compared/histograms").mkdir(parents=True, exist_ok=True)
+    Path(all_ejecta_path + "/vincs_compared/cols_v_time").mkdir(parents=True, exist_ok=True)
+    Path(all_ejecta_path + "/vincs_compared/inc_v_a").mkdir(parents=True, exist_ok=True)
+    Path(all_ejecta_path + "/vincs_compared/e_v_a").mkdir(parents=True, exist_ok=True)
+    
+    
+    
+    #create specific_collision_sites folder
+    Path(specific_sites_path).mkdir(parents=True, exist_ok=True)
+    
+    #populate specific_collision_sites folder
+    for j in range(num_sites):
+        
+        #folder for each site
+        site_path = specific_sites_path + "/site" + str(j) 
+        Path(site_path).mkdir(parents=True, exist_ok=True)
+        
+        #1. vincs_separate folder
+        for i in range(num_vincs):
+        
+            #make vincs_separate
+            vinc_folder = site_path + "/vincs_separate/" + str(i) + "vinc"
+            Path(vinc_folder).mkdir(parents=True, exist_ok=True)
+
+            #make all_planets
+            Path(vinc_folder + "/all_planets").mkdir(parents=True, exist_ok=True)
+            Path(vinc_folder + "/all_planets/inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            Path(vinc_folder + "/all_planets/e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            
+            #make and populate per_planet
+            per_p_folder = vinc_folder + "/per_planet"
+            Path(per_p_folder).mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/cols_v_time").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/cols_v_time_fits").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/inc_v_a").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/e_v_a").mkdir(parents=True, exist_ok=True)
+            for o in object_names[1:]:
+                Path(per_p_folder + "/inc_v_a/" + o + "_inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+                Path(per_p_folder + "/e_v_a/" + o + "_e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/inc_v_a/remaining_inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/e_v_a/remaining_e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/inc_v_a/esc_inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/e_v_a/esc_e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/inc_v_a/mixed_inc_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+            Path(per_p_folder + "/e_v_a/mixed_e_v_a_snapshots").mkdir(parents=True, exist_ok=True)
+        
+        ###2. vincs_compared folder
+        Path(site_path + "/vincs_compared").mkdir(parents=True, exist_ok=True)
+        Path(site_path + "/vincs_compared/histograms").mkdir(parents=True, exist_ok=True)
+        Path(site_path + "/vincs_compared/cols_v_time").mkdir(parents=True, exist_ok=True)
+        Path(site_path + "/vincs_compared/inc_v_a").mkdir(parents=True, exist_ok=True)
+        Path(site_path + "/vincs_compared/e_v_a").mkdir(parents=True, exist_ok=True)
+        
+    
+    #create single_ejecta_path folder
+    Path(single_ejecta_path).mkdir(parents=True, exist_ok=True)
+    #populate
+    for i in range(num_vincs):
+        Path(single_ejecta_path + '/' + str(i) + 'vinc').mkdir(parents=True, exist_ok=True)
+        
+        
+def sort_data(num_vincs=6):
+    """
+    DESCRIPTION:
+    Sorts data folders in Ejecta_Simulation_Data by vinc.
+    
+    CALLING SEQUENCE:
+    sort_data(num_vincs=6)
+    
+    KEYWORDS:
+    ## num_vincs: number of velocity increments (default 6; +0-5 km/s)
+    """
+    
+    parent = os.getcwd()
+    folders = sorted(glob.glob(parent + '/Ejecta_Simulation_Data/5000e*'))
+    for i in range(num_vincs):
+        Path(parent + '/Ejecta_Simulation_Data/'+str(i)+'vinc').mkdir(parents=True, exist_ok=True)
+    for folder in folders:
+        vincnum = folder.split('/')[-1].split('_')[2][0]
+        shutil.move(folder, parent + '/Ejecta_Simulation_Data/' + str(vincnum) + 'vinc')
+    
+    
+    

+ 49 - 0
data/purposeCombined/Directory/split_data_in_k_folds.py

@@ -0,0 +1,49 @@
+import os
+import shutil
+
+
+def populate_kfold_directories(data_dir, K_FOLDS):
+
+    alarmed_images = os.listdir(f"{data_dir}/Alarmed")
+    annoyed_images = os.listdir(f"{data_dir}/Annoyed")
+    curious_images = os.listdir(f"{data_dir}/Curious")
+    relaxed_images = os.listdir(f"{data_dir}/Relaxed")
+
+    for i in range(K_FOLDS):
+        validation_range = (i*20, i*20 + 20)
+
+        for j in range(0, 100):
+            if validation_range[0] <= j < validation_range[1]:
+                shutil.copy(f"{data_dir}/Alarmed/{alarmed_images[j]}", f"folds/fold{i}/validation/Alarmed/")
+                shutil.copy(f"{data_dir}/Annoyed/{annoyed_images[j]}", f"folds/fold{i}/validation/Annoyed/")
+                shutil.copy(f"{data_dir}/Curious/{curious_images[j]}", f"folds/fold{i}/validation/Curious/")
+                shutil.copy(f"{data_dir}/Relaxed/{relaxed_images[j]}", f"folds/fold{i}/validation/Relaxed/")
+            else:
+                shutil.copy(f"{data_dir}/Alarmed/{alarmed_images[j]}", f"folds/fold{i}/train/Alarmed/")
+                shutil.copy(f"{data_dir}/Annoyed/{annoyed_images[j]}", f"folds/fold{i}/train/Annoyed/")
+                shutil.copy(f"{data_dir}/Curious/{curious_images[j]}", f"folds/fold{i}/train/Curious/")
+                shutil.copy(f"{data_dir}/Relaxed/{relaxed_images[j]}", f"folds/fold{i}/train/Relaxed/")
+
+
+def create_kfold_directories(K_FOLDS):
+
+    try:
+        os.mkdir("folds")
+    except FileExistsError:
+        print("Directory 'folds' already exists")
+
+    for i in range(K_FOLDS):
+        try:
+            os.mkdir(f"folds/fold{i}/")
+            os.mkdir(f"folds/fold{i}/train")
+            os.mkdir(f"folds/fold{i}/validation")
+            os.mkdir(f"folds/fold{i}/train/Alarmed")
+            os.mkdir(f"folds/fold{i}/train/Annoyed")
+            os.mkdir(f"folds/fold{i}/train/Curious")
+            os.mkdir(f"folds/fold{i}/train/Relaxed")
+            os.mkdir(f"folds/fold{i}/validation/Alarmed")
+            os.mkdir(f"folds/fold{i}/validation/Annoyed")
+            os.mkdir(f"folds/fold{i}/validation/Curious")
+            os.mkdir(f"folds/fold{i}/validation/Relaxed")
+        except FileExistsError:
+            print(f"Directories for fold{i} already exist")

+ 80 - 0
data/purposeCombined/Directory/stc_vid2frames.py

@@ -0,0 +1,80 @@
+import sys
+import os
+import numpy as np
+import shutil
+import argparse
+import torch
+import torchvision
+from tqdm import tqdm
+
+def main():
+    parser = argparse.ArgumentParser(add_help=True)
+    parser.add_argument('--dataroot',
+                        default='.',
+                        help='Dataset root directory')
+    parser.add_argument('--src_vid_path', default='archive/training/videos/',
+                        help='Name of folder where `avi` files exist')
+    parser.add_argument('--tar_vid_frame_path', default='converted/train',
+                        help='Name of folder to save extracted frames.')
+    parser.add_argument('--src_npy_path', default='archive/test_pixel_mask/',
+                        help='Name of folder where `npy` frame mask exist')
+    parser.add_argument('--tar_anno_path', default='converted/pixel_mask',
+                        help='Name of folder to save extracted frame annotation')
+    parser.add_argument('--extension', default='jpg',
+                        help="File extension format for the output image")
+
+    args = parser.parse_args()
+
+    src_dir = os.path.join(args.dataroot, args.src_vid_path)
+    tar_dir = os.path.join(args.dataroot, args.tar_vid_frame_path)
+
+    try:
+        os.makedirs(tar_dir)
+    except FileExistsError:
+        print(F'{tar_dir} already exists, remove whole tree and recompose ...')
+        shutil.rmtree(tar_dir)
+        os.makedirs(tar_dir)
+
+    vid_list = os.listdir(src_dir)
+
+    for vidname in tqdm(vid_list):
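+        # torchvision.io.read_video returns (video_frames, audio_frames, info);
+        # indexing [0] keeps the T x H x W x C uint8 frame tensor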
+        vid = torchvision.io.read_video(os.path.join(src_dir, vidname), pts_unit='sec')[0]
+        target_folder = os.path.join(tar_dir, vidname[:-4])
+   
+        try: 
+            os.makedirs(target_folder)
+        except FileExistsError:
+            print(F'{target_folder} already exists, remove the directory and recompose ...')
+            shutil.rmtree(target_folder)
+            os.makedirs(target_folder) 
+            
+        for i, frame in enumerate(vid):
+            frame = (frame / 255.).permute(2, 0, 1) #HWC2CHW
+            torchvision.utils.save_image(frame,
+                                         F'{target_folder}/{i:03}.{args.extension}') 
+    
+    src_dir = os.path.join(args.dataroot, args.src_npy_path)    
+    tar_dir = os.path.join(args.dataroot, args.tar_anno_path)
+
+    try:
+        os.makedirs(tar_dir)
+    except FileExistsError:
+        print(F"{tar_dir} already exists, remove whole tree and recompose ...")
+        shutil.rmtree(tar_dir)
+        os.makedirs(tar_dir)
+
+    frame_anno = os.listdir(src_dir)
+
+    for _f in tqdm(frame_anno):
+        fn = _f[:-4]
+        target_folder = os.path.join(tar_dir, fn)
+        os.makedirs(target_folder)
+        # np.float was removed from NumPy; use an explicit dtype instead
+        px_anno = np.load(F"{src_dir}/{fn}.npy").astype(np.float32)
+
+        for i, px_frame in enumerate(px_anno):
+            torchvision.utils.save_image(torch.from_numpy(px_frame).unsqueeze(0), # CHW, 1 channel
+                                         F"{target_folder}/{i:03}.{args.extension}")
+
+
+if __name__ == '__main__':
+    main()

+ 197 - 0
data/purposeCombined/Directory/test_archive.py

@@ -0,0 +1,197 @@
+## Copyright (c) 2012 Aldebaran Robotics. All rights reserved.
+## Use of this source code is governed by a BSD-style license that can be
+## found in the COPYING file.
+
+"""Automatic testing for handling archives
+
+"""
+
+import os
+import sys
+import stat
+import errno
+import unittest
+import tempfile
+
+import qibuild
+
+class ArchiveTestCase(unittest.TestCase):
+    def setUp(self):
+        self.tmp = tempfile.mkdtemp(prefix="tmp-archive-test")
+
+    def tearDown(self):
+        qibuild.sh.rm(self.tmp)
+
+    def test_zip_extract(self):
+        # Create some files in the temp dir:
+        src = os.path.join(self.tmp, "src")
+        os.mkdir(src)
+        # Create a dir called a containing two files
+        # named b and c
+        a = os.path.join(src, "a")
+        os.mkdir(a)
+        b = os.path.join(a, "b")
+        with open(b, "w") as fp:
+            fp.write("b\n")
+        c = os.path.join(a, "c")
+        with open(c, "w") as fp:
+            fp.write("c\n")
+        archive = qibuild.archive.zip(a)
+        dest = os.path.join(self.tmp, "dest")
+        os.mkdir(dest)
+        qibuild.archive.extract(archive, dest)
+        ls_r = qibuild.sh.ls_r(dest)
+        self.assertEquals(ls_r, ["a/b", "a/c"])
+
+    def test_zip_extract_ro(self):
+        src = os.path.join(self.tmp, "src")
+        os.mkdir(src)
+        # Create a dir called a containing a single
+        # read-only file named ro
+        a = os.path.join(src, "a")
+        os.mkdir(a)
+        ro = os.path.join(a, "ro")
+        with open(ro, "w") as fp:
+            fp.write("ro\n")
+        # make the file read-only for the owner (0o400):
+        os.chmod(ro, stat.S_IRUSR)
+        archive = qibuild.archive.zip(a)
+        dest = os.path.join(self.tmp, "dest")
+        os.mkdir(dest)
+        qibuild.archive.extract(archive, dest)
+        ls_r = qibuild.sh.ls_r(dest)
+        self.assertEquals(ls_r, ["a/ro"])
+        dest_ro = os.path.join(dest, "a", "ro")
+        # check that the dest is readonly:
+        error = None
+        try:
+            open(dest_ro, "w")
+        except IOError as e:
+            error = e
+        self.assertFalse(error is None)
+        self.assertEquals(error.errno,  errno.EACCES)
+
+    def test_zip_extract_ro_dir(self):
+        src = os.path.join(self.tmp, "src")
+        os.mkdir(src)
+        ro1 = os.path.join(src, "ro1")
+        os.mkdir(ro1)
+        ro2 = os.path.join(ro1, "ro2")
+        os.mkdir(ro2)
+        a = os.path.join(ro2, "a")
+        with open(a, "w") as fp:
+            fp.write("a\n")
+        # RO dir inside an other RO dir
+        os.chmod(ro2, stat.S_IRUSR | stat.S_IXUSR)
+        os.chmod(ro1, stat.S_IRUSR | stat.S_IXUSR)
+        archive = qibuild.archive.zip(src)
+        dest = os.path.join(self.tmp, "dest")
+        os.mkdir(dest)
+        qibuild.archive.extract(archive, dest)
+        ls_r = qibuild.sh.ls_r(dest)
+        self.assertEquals(ls_r, ["src/ro1/ro2/a"])
+
+    def test_extract_preserve_executables_from_zip(self):
+        zip = qibuild.command.find_program("zip")
+        if not zip:
+            return
+        src = os.path.join(self.tmp, "src")
+        os.mkdir(src)
+        a_exe = os.path.join(src, "a.exe")
+        with open(a_exe, "w") as fp:
+            fp.write("a_exe\n")
+        st_700 = stat.S_IXUSR | stat.S_IRUSR | stat.S_IWUSR
+        os.chmod(a_exe, st_700)
+        qibuild.command.call(["zip", "-r", "src.zip", "src"],
+            cwd=self.tmp)
+        archive = os.path.join(self.tmp, "src.zip")
+        dest = os.path.join(self.tmp, "dest")
+        os.mkdir(dest)
+        qibuild.archive.extract_zip(archive, dest)
+        dest_exe = os.path.join(dest, "src", "a.exe")
+        st_mode = os.stat(dest_exe).st_mode
+        self.assertEquals(st_mode, 0o100700)
+
+    def test_extract_change_topdir(self):
+        src = os.path.join(self.tmp, "src")
+        os.mkdir(src)
+        a_long_dir = os.path.join(src, "a_long_dir")
+        os.mkdir(a_long_dir)
+        b = os.path.join(a_long_dir, "b")
+        with open(b, "w") as fp:
+            fp.write("b\n")
+        dest = os.path.join(self.tmp, "dest")
+        os.mkdir(dest)
+        tar_gz = qibuild.archive.zip_unix(a_long_dir)
+        qibuild.archive.extract(tar_gz, dest, topdir="a")
+        a = os.path.join(dest, "a")
+        ls_r = qibuild.sh.ls_r(a)
+        self.assertEquals(ls_r, ["b"])
+        a_zip = qibuild.archive.zip_win(a_long_dir)
+        qibuild.archive.extract(a_zip, dest, topdir="aa")
+        aa = os.path.join(dest, "aa")
+        ls_r = qibuild.sh.ls_r(aa)
+        self.assertEquals(ls_r, ["b"])
+
+    def test_extract_change_topdir_already_correct(self):
+        src = os.path.join(self.tmp, "src")
+        os.mkdir(src)
+        a_dir = os.path.join(src, "a")
+        os.mkdir(a_dir)
+        tar_gz = qibuild.archive.zip_unix(a_dir)
+        dest = os.path.join(self.tmp, "dest")
+        qibuild.archive.extract(tar_gz, dest, topdir="a")
+        ls_r = qibuild.sh.ls_r(dest)
+        self.assertEquals(ls_r, ["a/"])
+
+    def test_extract_with_symlink(self):
+        if sys.platform.startswith("win"):
+            return
+        src = os.path.join(self.tmp, "src")
+        os.mkdir(src)
+        a_dir = os.path.join(src, "a_dir")
+        os.mkdir(a_dir)
+        a_file = os.path.join(a_dir, "a_file")
+        with open(a_file, "w") as fp:
+            fp.write("a_file\n")
+        a_link = os.path.join(a_dir, "a_link")
+        os.symlink("a_file", a_link)
+        tar_gz = qibuild.archive.zip_unix(a_dir)
+        dest = os.path.join(self.tmp, "dest")
+        os.mkdir(dest)
+        qibuild.archive.extract(tar_gz, dest)
+        ls_r = qibuild.sh.ls_r(dest)
+        self.assertEquals(ls_r,
+            ['a_dir/a_file', 'a_dir/a_link'])
+        dest_link = os.path.join(dest, "a_dir", "a_link")
+        self.assertTrue(os.path.islink(dest_link))
+        dest_target = os.readlink(dest_link)
+        self.assertEquals(dest_target, "a_file")
+
+    def test_extract_with_symlink_and_change_topdir(self):
+        if sys.platform.startswith("win"):
+            return
+        src = os.path.join(self.tmp, "src")
+        os.mkdir(src)
+        a_long_dir = os.path.join(src, "a_long_dir")
+        os.mkdir(a_long_dir)
+        a_file = os.path.join(a_long_dir, "a_file")
+        with open(a_file, "w") as fp:
+            fp.write("a_file\n")
+        a_link = os.path.join(a_long_dir, "a_link")
+        os.symlink("a_file", a_link)
+        tar_gz = qibuild.archive.zip_unix(a_long_dir)
+        dest = os.path.join(self.tmp, "dest")
+        os.mkdir(dest)
+        qibuild.archive.extract(tar_gz, dest, topdir="a_dir")
+        ls_r = qibuild.sh.ls_r(dest)
+        self.assertEquals(ls_r,
+            ['a_dir/a_file', 'a_dir/a_link'])
+        dest_link = os.path.join(dest, "a_dir", "a_link")
+        self.assertTrue(os.path.islink(dest_link))
+        dest_target = os.readlink(dest_link)
+        self.assertEquals(dest_target, "a_file")
+
+
+if __name__ == "__main__":
+    unittest.main() 

+ 306 - 0
data/purposeCombined/Directory/test_tool.py

@@ -0,0 +1,306 @@
+import unittest
+from unittest.mock import patch
+import os
+import shutil
+from programy.admin.tool import AdminTool
+
+
+class MockAdminTool(AdminTool):
+
+    def __init__(self):
+        AdminTool.__init__(self)
+        self.text = ""
+
+    def display(self, text):
+        self.text += text
+
+
+class AdminToolTests(unittest.TestCase):
+
+    def get_temp_dir(self):
+        if os.name == 'posix':
+            return '/tmp'
+        elif os.name == 'nt':
+            import tempfile
+            return tempfile.gettempdir()
+        else:
+            raise Exception("Unknown operating system [%s]" % os.name)
+
+    def create_file(self, filename):
+        with open(filename, "w+") as file:
+            file.writelines(["line1", "line2", "line3"])
+            file.flush()
+            file.close()
+
+    def test_recursive_copy(self):
+        tmp_dir = self.get_temp_dir() + os.sep +"programy"
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+
+        os.mkdir(tmp_dir)
+
+        src_dir =  tmp_dir + os.sep + "src"
+        os.mkdir(src_dir)
+        src_sub_dir = tmp_dir + os.sep + "src" + os.sep + "sub"
+        os.mkdir(src_sub_dir)
+        src_sub_dir2 = tmp_dir + os.sep + "src" + os.sep + "sub2"
+        os.mkdir(src_sub_dir2)
+        dest_dir = tmp_dir + os.sep + "dest"
+        os.mkdir(dest_dir)
+
+        self.create_file(src_dir + os.sep + "file1.txt")
+        self.create_file(src_dir + os.sep + "file2.txt")
+        self.create_file(src_dir + os.sep + "file3.txt")
+        self.create_file(src_dir + os.sep + "sub" + os.sep + "file4.txt")
+
+        AdminTool.recursive_copy(src_dir, dest_dir)
+
+        self.assertTrue(os.path.exists(src_dir + os.sep + "file1.txt"))
+        self.assertTrue(os.path.exists(src_dir + os.sep + "file2.txt"))
+        self.assertTrue(os.path.exists(src_dir + os.sep + "file3.txt"))
+        self.assertTrue(os.path.exists(src_dir + os.sep + "sub" + os.sep + "file4.txt"))
+        self.assertTrue(os.path.exists(src_dir + os.sep + "sub2"))
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+
+    def test_delete_folder_contents(self):
+        tmp_dir = self.get_temp_dir() + os.sep +"programy"
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+
+        os.mkdir(tmp_dir)
+
+        src_dir =  tmp_dir + os.sep + "src"
+        os.mkdir(src_dir)
+        src_sub_dir = tmp_dir + os.sep + "src" + os.sep + "sub"
+        os.mkdir(src_sub_dir)
+        dest_dir = tmp_dir + os.sep + "dest"
+        os.mkdir(dest_dir)
+
+        self.create_file(src_dir + os.sep + "file1.txt")
+        self.create_file(src_dir + os.sep + "file2.txt")
+        self.create_file(src_dir + os.sep + "file3.txt")
+        self.create_file(src_dir + os.sep + "sub" + os.sep + "file4.txt")
+
+        self.assertTrue(os.path.exists(src_dir + os.sep + "file1.txt"))
+        self.assertTrue(os.path.exists(src_dir + os.sep + "file2.txt"))
+        self.assertTrue(os.path.exists(src_dir + os.sep + "file3.txt"))
+        self.assertTrue(os.path.exists(src_dir + os.sep + "sub" + os.sep + "file4.txt"))
+
+        AdminTool.delete_folder_contents(tmp_dir)
+
+        self.assertFalse(os.path.exists(src_dir + os.sep + "file1.txt"))
+        self.assertFalse(os.path.exists(src_dir + os.sep + "file2.txt"))
+        self.assertFalse(os.path.exists(src_dir + os.sep + "file3.txt"))
+        self.assertFalse(os.path.exists(src_dir + os.sep + "sub" + os.sep + "file4.txt"))
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+
+    def test_make_executable(self):
+        tmp_dir = self.get_temp_dir() + os.sep +"programy"
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+
+        os.mkdir(tmp_dir)
+
+        filepath = tmp_dir + os.sep + "file1.txt"
+        self.create_file(filepath)
+
+        self.assertTrue(os.path.exists(filepath))
+
+        AdminTool.make_executable(filepath)
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+
+    def test_make_all_executable(self):
+        tmp_dir = self.get_temp_dir() + os.sep +"programy"
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+
+        os.mkdir(tmp_dir)
+
+        src_dir =  tmp_dir + os.sep + "src"
+        os.mkdir(src_dir)
+        src_sub_dir = tmp_dir + os.sep + "src" + os.sep + "sub"
+        os.mkdir(src_sub_dir)
+        dest_dir = tmp_dir + os.sep + "dest"
+        os.mkdir(dest_dir)
+
+        self.create_file(src_dir + os.sep + "file1.txt")
+        self.create_file(src_dir + os.sep + "file2.txt")
+        self.create_file(src_dir + os.sep + "file3.txt")
+        self.create_file(src_dir + os.sep + "sub" + os.sep + "file4.txt")
+
+        AdminTool.make_all_executable(tmp_dir)
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+
+    def test_list_bots(self):
+        tool = MockAdminTool()
+        self.assertEqual("", tool.text)
+
+        tool.list_bots()
+
+        self.assertEquals("""Available bots are:
+	alice2-y	professor-y	rosie-y	talk-y	y-bot	servusai-y	template-y	traintimes-y
+	To download use 'python3 -m programy.admin.tool download <bot-name>'
+Additional components are:
+	textblob
+	To install use 'python3 -m programy.admin.tool install <component>'""", tool.text)
+
+    def patch_wget_download(self, url):
+        return "mock.bot"
+
+    @patch("programy.admin.tool.AdminTool.wget_download", patch_wget_download)
+    def test_download_bot(self):
+        tool = MockAdminTool()
+        self.assertEqual("", tool.text)
+
+        filename = tool.download_bot("y-bot")
+        self.assertEqual("mock.bot", filename)
+
+        self.assertEqual("""Downloading [y-bot] from [https://github.com/keiffster/y-bot/archive/master.zip]
+Download complete""", tool.text)
+
+    def test_zip_dir_name_from_filename(self):
+        self.assertEqual("filename", AdminTool.zip_dir_name_from_filename('filename.zip'))
+        self.assertEqual("filename", AdminTool.zip_dir_name_from_filename('filename'))
+
+    def test_extract_bot_no_remove(self):
+        tool = AdminTool()
+
+        tmp_dir = self.get_temp_dir() + os.sep +"programy"
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+        os.mkdir(tmp_dir)
+        shutil.copyfile(os.path.dirname(__file__) + os.sep + "bot.zip", tmp_dir + os.sep + "bot.zip")
+
+        tool.extract_bot(tmp_dir + os.sep + "bot.zip", path=tmp_dir, remove_after=False)
+
+        self.assertTrue(os.path.exists(tmp_dir + os.sep + "bot.zip"))
+        self.assertTrue(os.path.exists(tmp_dir + os.sep + "test1.txt"))
+        self.assertTrue(os.path.exists(tmp_dir + os.sep + "test2.txt"))
+
+        shutil.rmtree(tmp_dir)
+
+    def test_extract_bot_with_remove(self):
+        tool = AdminTool()
+
+        tmp_dir = self.get_temp_dir() + os.sep +"programy"
+
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+        os.mkdir(tmp_dir)
+        shutil.copyfile(os.path.dirname(__file__) + os.sep + "bot.zip", tmp_dir + os.sep + "bot.zip")
+
+        tool.extract_bot(tmp_dir + os.sep + "bot.zip", path=tmp_dir, remove_after=True)
+
+        self.assertFalse(os.path.exists(tmp_dir + os.sep + "bot.zip"))
+        self.assertTrue(os.path.exists(tmp_dir + os.sep + "test1.txt"))
+        self.assertTrue(os.path.exists(tmp_dir + os.sep + "test2.txt"))
+
+        shutil.rmtree(tmp_dir)
+
+    def patch_download_and_make_active(self, bot_name):
+        pass # Do nothing
+
+    @patch("programy.admin.tool.AdminTool.download_and_make_active", patch_download_and_make_active)
+    def test_install_bot(self):
+        tool = MockAdminTool()
+        self.assertEquals("", tool.text)
+
+        tool.install_bot(["test", "y-bot"])
+        self.assertEqual("""
+To run y-bot bot in console mode, use the following commands
+\tcd scripts/xnix\t./y-bot.sh""", tool.text)
+
+    def test_install_bot_unknown(self):
+        tool = MockAdminTool()
+        self.assertEquals("", tool.text)
+
+        with self.assertRaises(Exception):
+            tool.install_bot(["test", "unknown"])
+
+    def patch_install_textblob(self):
+        pass # Do nothing
+
+    @patch("programy.admin.tool.AdminTool.install_textblob", patch_install_textblob)
+    def test_install_additional(self):
+        tool = MockAdminTool()
+        self.assertEquals("", tool.text)
+
+        tool.install_additional(["test", "textblob"])
+        self.assertEqual("Installing additional components for textblob", tool.text)
+
+    def test_install_additional_invalid(self):
+        tool = MockAdminTool()
+        self.assertEquals("", tool.text)
+
+        with self.assertRaises(Exception):
+            tool.install_additional(["test", "xxxxxxx"])
+
+    def test_show_execute_help(self):
+        tool = MockAdminTool()
+        self.assertEqual("", tool.text)
+
+        tool.show_execute_help("y-bot")
+
+        self.assertEqual("""
+To run y-bot bot in console mode, use the following commands
+\tcd scripts/xnix\t./y-bot.sh""", tool.text)
+
+    def test_show_help(self):
+        tool = MockAdminTool()
+        self.assertEqual("", tool.text)
+
+        tool.show_help()
+
+        self.assertEqual("""Available commands are:
+\thelp	list	download <bot-name>	install <component>""", tool.text)
+
+    def test_run_no_words(self):
+        tool = MockAdminTool()
+        tool.run([])
+        self.assertIsNotNone(tool.text)
+        self.assertTrue(tool.text.startswith("Available commands are:"))
+
+    def test_run_unknown_primary_command(self):
+        tool = MockAdminTool()
+        tool.run(['unknown'])
+        self.assertIsNotNone(tool.text)
+        self.assertTrue(tool.text.startswith("Unknown primary command [unknown]"))
+
+    def test_run_missing_bot_name(self):
+        tool = MockAdminTool()
+        tool.run(['download'])
+        self.assertIsNotNone(tool.text)
+        self.assertTrue(tool.text.startswith("Missing bot name from download command"))
+
+    def test_run_list(self):
+        tool = MockAdminTool()
+        tool.run(['list'])
+        self.assertIsNotNone(tool.text)
+
+    def test_run_download(self):
+        tool = MockAdminTool()
+        tool.run(['download'])
+        self.assertIsNotNone(tool.text)
+
+    def test_run_install(self):
+        tool = MockAdminTool()
+        tool.run(['install'])
+        self.assertIsNotNone(tool.text)
+
+    def test_run_help(self):
+        tool = MockAdminTool()
+        tool.run(['help'])
+        self.assertIsNotNone(tool.text)

+ 272 - 0
data/purposeCombined/Directory/tutorial.py

@@ -0,0 +1,272 @@
+import csv
+import os
+import re
+import shutil
+
+def del_create_analytics_folder():
+    # del the analytics folder including subfolder
+    # mkdir the analytics folder (only mkdir)
+    if os.path.exists('analytics'):
+        shutil.rmtree('analytics')
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')
+
+def course():
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')
+    if os.path.exists('analytics/course'):
+        shutil.rmtree('analytics/course')
+    d = {'01':'btech',
+    '11':'mtech',
+    '21':'phd',
+    '12':'msc'}
+    with open('studentinfo_cs384.csv', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        if not os.path.exists('analytics/course'):
+            os.mkdir('analytics/course')
+        for row in reader:
+            if len(row)==0:
+                # skip blank rows
+                continue
+            l = list(row.values())
+            head = list(row.keys())
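+            # assumed roll-number layout (enforced by the regex below): 2-digit year,
+            # 2-digit degree code, 2-letter branch, 2-digit serial, e.g. '1901CS07' (hypothetical)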
+            stream = str(row['id'][-4:-2]).lower()
+            yr = str(row['id'][:2])
+            if str(row['id'][2:4]) in list(d.keys()):
+                degree = d[str(row['id'][2:4])]
+            else:
+                with open('analytics/course/' + 'misc.csv' , mode = 'a') as f:
+                    f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                    if os.path.getsize('analytics/course/' + 'misc.csv')==0:
+                        f_write.writerow(head)
+                    f_write.writerow(l)
+                f.close()
+                continue
+            csv_name = f'{yr}_{stream}_{degree}.csv'
+            p = re.compile(r'\d\d\d\d\D\D\d\d')
+            k = re.fullmatch(p,row['id'])
+            if k:
+                if not os.path.exists('analytics/course/'+ stream):
+                    os.mkdir('analytics/course/'+ stream) 
+                if not os.path.exists('analytics/course/'+ stream + '/' + degree):
+                    os.mkdir('analytics/course/'+ stream + '/' + degree ) 
+                with open('analytics/course/'+ stream + '/' + degree + '/' + csv_name , mode = 'a') as f:
+                    f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                    if os.path.getsize('analytics/course/'+ stream + '/' + degree + '/' + csv_name)==0:
+                        f_write.writerow(head)
+                    f_write.writerow(l)
+                f.close()
+            else:
+                with open('analytics/course/' + 'misc.csv' , mode = 'a') as f:
+                    f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                    if os.path.getsize('analytics/course/' + 'misc.csv')==0:
+                        f_write.writerow(head)
+                    f_write.writerow(l)
+                f.close()
+    csvfile.close()
+
+
+def country():
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')    
+    if os.path.exists('analytics/country'):
+        shutil.rmtree('analytics/country')    
+    with open('studentinfo_cs384.csv', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        if not os.path.exists('analytics/country'):
+            os.mkdir('analytics/country')
+        for row in reader:
+            l = list(row.values())
+            head = list(row.keys())
+            with open('analytics/country/'+row['country'].lower()+ '.csv', mode = 'a') as f:
+                f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                if os.path.getsize('analytics/country/'+row['country'].lower() + '.csv')==0:
+                  f_write.writerow(head) 
+                f_write.writerow(l)
+            f.close()
+    csvfile.close()
+
+
+
+
+def email_domain_extract():
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')
+    if os.path.exists('analytics/email'):
+        shutil.rmtree('analytics/email')
+    with open('studentinfo_cs384.csv', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        if not os.path.exists('analytics/email'):
+            os.mkdir('analytics/email')
+        for row in reader:
+            l = list(row.values())
+            head = list(row.keys())
+            if '@' in row['email'] and '.' in row['email']:
+                domain = row['email'].split('@')[1].split('.')[0]
+                with open('analytics/email/'+domain+ '.csv', mode = 'a') as f:
+                    f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                    if os.path.getsize('analytics/email/'+ domain + '.csv')==0:
+                        f_write.writerow(head) 
+                    f_write.writerow(l)
+                f.close()
+
+            else:
+                with open('analytics/email/'+'misc'+ '.csv', mode = 'a') as f:
+                    f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                    if os.path.getsize('analytics/email/misc.csv')==0:
+                        f_write.writerow(head) 
+                    f_write.writerow(l)
+                f.close()
+    csvfile.close()
+
+
+
+
+
+def gender():
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')
+    if os.path.exists('analytics/gender'):
+        shutil.rmtree('analytics/gender')
+    with open('studentinfo_cs384.csv', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        if not os.path.exists('analytics/gender'):
+            os.mkdir('analytics/gender')
+        for row in reader:
+            l = list(row.values())
+            head = list(row.keys())
+            gender = row['gender'].lower()
+            with open('analytics/gender/'+gender+'.csv', mode='a') as f:
+                f_write = csv.writer(f, delimiter=',', lineterminator='\r')
+                if os.path.getsize('analytics/gender/'+gender+'.csv') == 0:
+                    f_write.writerow(head)
+                f_write.writerow(l)
+            f.close()
+    csvfile.close()
+
+
+def dob():
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')
+    if os.path.exists('analytics/dob'):
+        shutil.rmtree('analytics/dob')
+    with open('studentinfo_cs384.csv', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        if not os.path.exists('analytics/dob'):
+            os.mkdir('analytics/dob')
+        for row in reader:
+            l = list(row.values())
+            head = list(row.keys())
+            x = str(re.sub(r"\D","-",row['dob']))
+            yr = int(x.split('-')[-1])
+            k = int(yr)%10
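+            # bucket birth years into 5-year ranges keyed on the last digit,
+            # e.g. a 1997 birth year falls in 'bday_1995_1999'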
+            if k>4:
+                name = 'bday_' + str(yr - k + 5) + '_' + str(yr - k + 9)
+            else:
+                name = 'bday_' + str(yr - k ) + '_' + str(yr - k + 4)
+            if yr > 2014:
+                name = 'bday_2015_2020'
+            with open('analytics/dob/'+name+'.csv', mode='a') as f:
+                f_write = csv.writer(f, delimiter=',', lineterminator='\r')
+                if os.path.getsize('analytics/dob/'+name+'.csv') == 0:
+                    f_write.writerow(head)
+                f_write.writerow(l)
+            f.close()
+        
+
+
+
+def state():
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')
+    if os.path.exists('analytics/state'):
+        shutil.rmtree('analytics/state')
+    with open('studentinfo_cs384.csv', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        if not os.path.exists('analytics/state'):
+            os.mkdir('analytics/state')
+        for row in reader:
+            l = list(row.values())
+            head = list(row.keys())
+            with open('analytics/state/'+row['state'].lower()+ '.csv', mode = 'a') as f:
+                f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                if os.path.getsize('analytics/state/'+row['state'].lower() + '.csv')==0:
+                  f_write.writerow(head) 
+                f_write.writerow(l)
+            f.close()
+    csvfile.close()
+
+
+def blood_group():
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')
+    if os.path.exists('analytics/blood_group'):
+        shutil.rmtree('analytics/blood_group')
+    with open('studentinfo_cs384.csv', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        if not os.path.exists('analytics/blood_group'):
+            os.mkdir('analytics/blood_group')
+        for row in reader:
+            l = list(row.values())
+            head = list(row.keys())
+            with open('analytics/blood_group/'+row['blood_group']+ '.csv', mode = 'a') as f:
+                f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+                if os.path.getsize('analytics/blood_group/'+row['blood_group'] + '.csv')==0:
+                    f_write.writerow(head) 
+                f_write.writerow(l)
+            f.close()
+    csvfile.close()
+
+
+# Create the new file here and also sort it in this function only.
+def new_file_sort():
+    if not os.path.exists('analytics'):
+        os.mkdir('analytics')
+    new = []
+    head = []
+    with open('studentinfo_cs384.csv', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)       
+        for row in reader:
+            head = list(row.keys())
+            del head[1]
+            head.insert(1,'first_name')
+            head.insert(2,'last_name')
+            k = list(row.values())
+            del k[1]
+            k.insert(1,row['full_name'].split()[0])
+            k.insert(2,' '.join(row['full_name'].split()[1:]))
+            new.append(k)
+    csvfile.close()
+    with open('analytics/studentinfo_cs384_names_split.csv', newline='',mode='w') as f:
+        f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+        f_write.writerow(head)
+        for i in new:
+            f_write.writerow(i)
+    f.close()
+    #sorting
+    dic = {}
+    for i in new:
+        dic[i[1]]='#$%^&*'.join(i)
+    new = []
+    # truncate the sorted-output file before the rows are appended below
+    open('analytics/studentinfo_cs384_names_split_sorted_first_name.csv', mode='w').close()
+    for i in sorted(dic.items()):
+        new.append(i[1].split('#$%^&*'))
+    with open('analytics/studentinfo_cs384_names_split_sorted_first_name.csv', mode = 'a') as f:
+        f_write = csv.writer(f, delimiter=',',lineterminator='\r')
+        f_write.writerow(head)
+        for i in new:
+            f_write.writerow(i)
+    f.close()
+
+#if __name__ == "__main__":
+#     del_create_analytics_folder()
+#     course()
+#     blood_group()
+#     new_file_sort()s
+#     state()
+#     email_domain_extract()
+#     state()
+#     gender()
+#     dob()

Some files were not shown because of the large number of changes