# data_preprocessing_utils.py

# Customary Imports:
import tensorflow as tf
assert tf.__version__.startswith('2.')  # make sure you're using TF 2.x
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import skimage
import skimage.filters  # explicit submodule import; the bare `import skimage` does not provide it
import cv2 as cv
import os
import datetime
import scipy
import scipy.ndimage  # explicit submodule import for median_filter below
from skimage.morphology import reconstruction
from skimage import exposure
import scipy.io as sio
import h5py
import random
import shutil
import PIL
import imageio
import pydot
import graphviz
import plotly.graph_objects as go
import preprocess_crop  # local helper module, not a PyPI package
from pathlib import Path
from tensorflow.keras import backend as K
from PIL import Image
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras import Model
#from keras.utils import CustomObjectScope
from mpl_toolkits.mplot3d import Axes3D
import data_preprocessing_utils  # self-import so helpers can be referenced by module name below
##################################################################################################################################
'''
DATA PREPROCESSING UTILS:
'''
##################################################################################################################################
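# Typical pipeline (a sketch of the intended call order for the functions below):
#   1. convert_MAP(...)      -> convert raw .mat files to arrays on disk
#   2. data_cleaning(...)    -> denoise / contrast-equalize each array
#   3. data_seperation(...)  -> shuffle arrays into train(/val)/test folders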
# Converting MAP Files:
def convert_MAP(directory, output_directory, min_shape, file_format='.npy', search_keys=None, dtype=np.float32):
    '''
    Loops through the given raw-data directory and converts each
    .mat file to the given output file format (default .npy)
    '''
    new_dir = os.path.join(os.getcwd(), output_directory)
    if not os.path.exists(new_dir):
        os.mkdir(new_dir)
    else:
        shutil.rmtree(new_dir)
        os.mkdir(new_dir)
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith(".mat"):
            #print(os.path.join(directory, filename))
            filepath = os.path.join(directory, filename)
            array_dict = {}
            try:
                f = h5py.File(filepath, 'r')   # MATLAB v7.3 files are HDF5
            except OSError:
                f = sio.loadmat(filepath)      # older .mat versions fall back to scipy
            for k, v in f.items():
                if k.startswith('__'):
                    continue                   # skip loadmat metadata (__header__, etc.)
                array_dict[k] = np.array(v, dtype=dtype)
            if isinstance(f, h5py.File):
                f.close()
            # As we only need the image info from the dict, filter for the wanted keys
            if search_keys is None:
                # Out of the struct of .mat files we want "map"
                filtered_dict = dict(filter(lambda item: 'map' in item[0], array_dict.items()))
            else:
                filtered_dict = {}
                for search_key in search_keys:
                    if search_key in array_dict:
                        filtered_dict[search_key] = array_dict[search_key]
            if len(filtered_dict) == 0:
                print('No Data to Meet Search Key Requirements: Datapoint Rejected -> ' + filepath)
            else:
                #print(list(array_dict.keys()))
                #print(filtered_dict)
                arrays = []
                for k, v in filtered_dict.items():
                    temp = np.transpose(v.astype(np.float32))
                    # To normalize data between [-1,1],  use -> temp = temp/(np.max(temp)/2) - 1
                    # To normalize data between [0,1],   use -> temp = temp/np.max(temp)
                    # To normalize data between [0,255],
                    # use -> temp = (temp/np.max(temp)*255).astype(np.uint8)
                    temp = temp/np.max(temp)
                    arrays.append(temp)
                for i in range(len(arrays)):
                    if len(arrays[i].shape) > 2:
                        #print(arrays[i].shape)
                        arrays[i] = np.mean(arrays[i], axis=2)
                for i in range(len(arrays)):
                    # os.path.splitext (not str.strip) safely drops the '.mat' suffix
                    new_dir_filepath = os.path.join(new_dir, os.path.splitext(filename)[0]
                                                    + '_index' + str(i) + file_format)
                    array = arrays[i]
                    if array.shape[0] >= min_shape[0] and array.shape[1] >= min_shape[1]:
                        if file_format == '.npy':
                            np.save(new_dir_filepath, array, allow_pickle=True, fix_imports=True)
                        else:
                            imageio.imwrite(new_dir_filepath, array)
                    elif i == 0:
                        print('Min Size Not Met: Datapoint Rejected -> ' + filepath)
    return os.path.join(os.getcwd(), output_directory)
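# Example usage (a minimal sketch; 'raw_data' and the min_shape are hypothetical):
# converted_dir = convert_MAP('raw_data', 'converted_data', min_shape=(128, 128))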
##################################################################################################################################
# Data Cleaning Procedures:
def data_clean_func(image=None):
    if image is not None:
        #print(len(np.unique(image)))
        #clean_image = image
        '''
        plt.hist(image)
        plt.show()
        '''
        '''
        plt.imshow(image, cmap='gray')
        plt.title('Original Image')
        plt.show()
        '''
        # Clip away the brightest outliers and near-zero background
        threshold = 0.85
        default_fill = 0.0
        frac_of_high_clip = 1/9
        image[image > threshold] = default_fill
        image[image < frac_of_high_clip*(1.0-threshold)] = default_fill
        '''
        plt.imshow(image, cmap='gray')
        plt.title('After Clipping')
        plt.show()
        '''
        image = scipy.ndimage.median_filter(image, size=(4, 4))
        '''
        plt.imshow(image, cmap='gray')
        plt.title('After Median Filter')
        plt.show()
        '''
        image = skimage.filters.gaussian(image, sigma=0.01, output=None, mode='reflect', preserve_range=True)
        ####################################################################
        # Added to ensure negligible loss when converting to uint16
        # within exposure.equalize_adapthist
        # (scaling by 2**16 - 1, not 2**16, keeps the max in uint16 range)
        image = (image/np.max(image)*(2**16 - 1)).astype(np.uint16)
        # A "monkey patch" could possibly be used as a cleaner solution,
        # but would be more involved than is necessary for my application
        ####################################################################
        image = exposure.equalize_adapthist(image, kernel_size=image.shape[0]//8, clip_limit=0.005, nbins=2**13)
        image = image.astype(np.float64)
        '''
        plt.imshow(image, cmap='gray')
        plt.title('After Local Adapt Hist')
        plt.show()
        '''
        # Separable median filtering followed by a light Gaussian blur
        image = scipy.ndimage.median_filter(image, size=(3, 1))
        image = scipy.ndimage.median_filter(image, size=(1, 3))
        image = skimage.filters.gaussian(image, sigma=0.1, output=None, mode='reflect', preserve_range=True)
        image = exposure.rescale_intensity(image, in_range='image', out_range=(0.0, 1.0))
        '''
        plt.imshow(image, cmap='gray')
        plt.title('Final Image')
        plt.show()
        '''
        '''
        plt.hist(image)
        plt.show()
        '''
        clean_image = image.astype(np.float32)
    else:
        clean_image = image
    return clean_image
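# Example usage (a minimal sketch; the input path is hypothetical):
# cleaned = data_clean_func(np.load('converted_data/sample_index0.npy'))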
def data_cleaning(input_dir='converted_data', output_dir_name='cleaned_data',
                  output_file_format='.npy', delete_previous=True):
    '''
    Removes some of the noise from the data and makes the underlying
    vessel structure more prominent
    Input:  input_dir -> directory that holds the data to be cleaned
            output_dir_name -> directory to hold the cleaned data
    Output: None
    '''
    file_list = os.listdir(input_dir)
    clean_dir = os.path.join(os.getcwd(), output_dir_name)
    if not os.path.exists(clean_dir):
        os.mkdir(clean_dir)
    elif delete_previous:
        shutil.rmtree(clean_dir)
        os.mkdir(clean_dir)
    for file in file_list:
        filename = os.fsdecode(file)
        filepath = os.path.join(input_dir, filename)
        if filepath.endswith('.npy'):
            array = np.load(filepath)
        else:
            array = imageio.imread(filepath)
        # Data clean function defined above:
        array = data_preprocessing_utils.data_clean_func(array)
        # Swap the original suffix for the requested output format
        new_filepath = Path(os.path.join(clean_dir, filename)).with_suffix(output_file_format)
        if output_file_format == '.npy':
            np.save(new_filepath, array, allow_pickle=True, fix_imports=True)
        else:
            imageio.imwrite(new_filepath, array)
    return
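# Example usage (a minimal sketch, assuming convert_MAP was run with the default directories):
# data_cleaning(input_dir='converted_data', output_dir_name='cleaned_data')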
##################################################################################################################################
# Data Separation / Validation Split Procedures:
def data_seperation(input_dir, dataset_percentages,
                    delete_previous=False, file_format='.npy',
                    scale=1):
    '''
    Takes a folder of numpy arrays and creates a data folder with separate
    sections for training, validation, and testing according to the given
    percentages
    Input:  input_dir -> file path to the data folder of numpy files
            dataset_percentages -> (% train, % test) such that % train + % test = 100
            OR
            dataset_percentages -> (% train, % val, % test) such that % train + % val + % test = 100
    Output: new folders for training and testing or training/validation/testing
    '''
    new_dir = os.path.join(os.getcwd(), 'data')  # main data folder (created on demand)

    def make_split_dir(name):
        # Builds data/<name>/input, clearing any previous split if requested;
        # os.makedirs also covers the case where data/<name> already exists
        # but data/<name>/input does not
        split_dir = os.path.join(new_dir, name)
        if delete_previous and os.path.exists(split_dir):
            shutil.rmtree(split_dir)
        split_input_dir = os.path.join(split_dir, 'input')
        os.makedirs(split_input_dir, exist_ok=True)
        return split_input_dir

    def load_array(filepath):
        # Loads a file and rescales it to [0, scale]
        if filepath.endswith('.npy'):
            array = np.load(filepath)
        else:
            array = imageio.imread(filepath)
        return array/np.max(array)*scale

    def save_array(array, filepath):
        # Saves in the requested file format, swapping the original suffix
        filepath = Path(filepath).with_suffix(file_format)
        if file_format == '.npy':
            np.save(filepath, array, allow_pickle=True, fix_imports=True)
        else:
            imageio.imwrite(filepath, array)

    # If just train and test
    if len(dataset_percentages) == 2:
        train_dir = make_split_dir('train')
        test_dir = make_split_dir('test')
        file_list = os.listdir(input_dir)
        total_num_imgs = len(file_list)
        train_percent, test_percent = dataset_percentages
        valid_inputs = (train_percent >= test_percent and train_percent <= 100 and
                        test_percent <= 100 and train_percent > 0 and test_percent > 0 and
                        train_percent + test_percent == 100)
        if valid_inputs:
            num_train = total_num_imgs*train_percent//100
        else:
            num_train = int(round(total_num_imgs*0.9))
            print('ERROR: Please input valid percentages for dataset division')
            print('In place of valid input the ratio 90% train, 10% test was used')
        random.shuffle(file_list)
        for index, file in enumerate(file_list):
            filename = os.fsdecode(file)
            array = load_array(os.path.join(input_dir, filename))
            if index < num_train:
                save_array(array, os.path.join(train_dir, filename))
            else:
                save_array(array, os.path.join(test_dir, filename))
        return train_dir, test_dir
    # If train, val, and test
    elif len(dataset_percentages) == 3:
        train_dir = make_split_dir('train')
        val_dir = make_split_dir('val')
        test_dir = make_split_dir('test')
        file_list = os.listdir(input_dir)
        total_num_imgs = len(file_list)
        train_percent, val_percent, test_percent = dataset_percentages
        valid_inputs = (train_percent >= test_percent and train_percent >= val_percent
                        and train_percent <= 100 and val_percent <= 100 and test_percent <= 100
                        and train_percent > 0 and val_percent > 0 and test_percent > 0 and
                        train_percent + val_percent + test_percent == 100)
        if valid_inputs:
            num_train = total_num_imgs*train_percent//100
            num_val = total_num_imgs*val_percent//100
        else:
            num_train = int(round(total_num_imgs*0.9))
            num_val = int(round((total_num_imgs - num_train)/2))
            print('ERROR: Please input valid percentages for dataset division')
            print('In place of a valid input the ratio 90% train, 5% val, 5% test was used')
        random.shuffle(file_list)
        for index, file in enumerate(file_list):
            filename = os.fsdecode(file)
            array = load_array(os.path.join(input_dir, filename))
            if index < num_train:
                save_array(array, os.path.join(train_dir, filename))
            elif index < num_train + num_val:  # '<' avoids placing one extra file in val
                save_array(array, os.path.join(val_dir, filename))
            else:
                save_array(array, os.path.join(test_dir, filename))
        return train_dir, val_dir, test_dir
    else:
        print('ERROR: Please divide into train/test or train/val/test')
        return None
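# Example usage (a minimal sketch; the split percentages are hypothetical):
# train_dir, val_dir, test_dir = data_seperation('cleaned_data', (80, 10, 10), delete_previous=True)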