# data_preprocessing_utils.py

# Customary Imports:
import tensorflow as tf
assert tf.__version__.startswith('2.')  # make sure you're using TF 2.x
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import skimage
import skimage.filters  # explicit submodule import; the bare `import skimage` does not provide it
import cv2 as cv
import os
import datetime
import scipy
import scipy.ndimage  # explicit submodule import for median_filter below
from skimage.morphology import reconstruction
from skimage import exposure
import scipy.io as sio
import h5py
import random
import shutil
import PIL
import imageio
import pydot
import graphviz
import plotly.graph_objects as go
import preprocess_crop  # local helper module, not a PyPI package
from pathlib import Path
from tensorflow.keras import backend as K
from PIL import Image
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras import Model
#from keras.utils import CustomObjectScope
from mpl_toolkits.mplot3d import Axes3D
import data_preprocessing_utils  # self-import so helpers can be referenced by module name below
##################################################################################################################################
'''
DATA PREPROCESSING UTILS:
'''
##################################################################################################################################
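# Typical pipeline (a sketch of the intended call order for the functions below):
#   1. convert_MAP(...)      -> convert raw .mat files to arrays on disk
#   2. data_cleaning(...)    -> denoise / contrast-equalize each array
#   3. data_seperation(...)  -> shuffle arrays into train(/val)/test folders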
# Converting MAP Files:
def convert_MAP(directory, output_directory, min_shape, file_format='.npy', search_keys=None, dtype=np.float32):
    '''
    Loops through the given raw-data directory and converts each
    .mat file to the given output file format (default .npy)
    '''
    new_dir = os.path.join(os.getcwd(), output_directory)
    if not os.path.exists(new_dir):
        os.mkdir(new_dir)
    else:
        shutil.rmtree(new_dir)
        os.mkdir(new_dir)
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith(".mat"):
            #print(os.path.join(directory, filename))
            filepath = os.path.join(directory, filename)
            array_dict = {}
            try:
                f = h5py.File(filepath, 'r')   # MATLAB v7.3 files are HDF5
            except OSError:
                f = sio.loadmat(filepath)      # older .mat versions fall back to scipy
            for k, v in f.items():
                if k.startswith('__'):
                    continue                   # skip loadmat metadata (__header__, etc.)
                array_dict[k] = np.array(v, dtype=dtype)
            if isinstance(f, h5py.File):
                f.close()
            # As we only need the image info from the dict, filter for the wanted keys
            if search_keys is None:
                # Out of the struct of .mat files we want "map"
                filtered_dict = dict(filter(lambda item: 'map' in item[0], array_dict.items()))
            else:
                filtered_dict = {}
                for search_key in search_keys:
                    if search_key in array_dict:
                        filtered_dict[search_key] = array_dict[search_key]
            if len(filtered_dict) == 0:
                print('No Data to Meet Search Key Requirements: Datapoint Rejected -> ' + filepath)
            else:
                #print(list(array_dict.keys()))
                #print(filtered_dict)
                arrays = []
                for k, v in filtered_dict.items():
                    temp = np.transpose(v.astype(np.float32))
                    # To normalize data between [-1,1],  use -> temp = temp/(np.max(temp)/2) - 1
                    # To normalize data between [0,1],   use -> temp = temp/np.max(temp)
                    # To normalize data between [0,255],
                    # use -> temp = (temp/np.max(temp)*255).astype(np.uint8)
                    temp = temp/np.max(temp)
                    arrays.append(temp)
                for i in range(len(arrays)):
                    if len(arrays[i].shape) > 2:
                        #print(arrays[i].shape)
                        arrays[i] = np.mean(arrays[i], axis=2)
                for i in range(len(arrays)):
                    # os.path.splitext (not str.strip) safely drops the '.mat' suffix
                    new_dir_filepath = os.path.join(new_dir, os.path.splitext(filename)[0]
                                                    + '_index' + str(i) + file_format)
                    array = arrays[i]
                    if array.shape[0] >= min_shape[0] and array.shape[1] >= min_shape[1]:
                        if file_format == '.npy':
                            np.save(new_dir_filepath, array, allow_pickle=True, fix_imports=True)
                        else:
                            imageio.imwrite(new_dir_filepath, array)
                    elif i == 0:
                        print('Min Size Not Met: Datapoint Rejected -> ' + filepath)
    return os.path.join(os.getcwd(), output_directory)
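# Example usage (a minimal sketch; 'raw_data' and the min_shape are hypothetical):
# converted_dir = convert_MAP('raw_data', 'converted_data', min_shape=(128, 128))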
##################################################################################################################################
# Data Cleaning Procedures:
def data_clean_func(image=None):
    if image is not None:
        #print(len(np.unique(image)))
        #clean_image = image
        '''
        plt.hist(image)
        plt.show()
        '''
        '''
        plt.imshow(image, cmap='gray')
        plt.title('Original Image')
        plt.show()
        '''
        # Clip away the brightest outliers and near-zero background
        threshold = 0.85
        default_fill = 0.0
        frac_of_high_clip = 1/9
        image[image > threshold] = default_fill
        image[image < frac_of_high_clip*(1.0-threshold)] = default_fill
        '''
        plt.imshow(image, cmap='gray')
        plt.title('After Clipping')
        plt.show()
        '''
        image = scipy.ndimage.median_filter(image, size=(4, 4))
        '''
        plt.imshow(image, cmap='gray')
        plt.title('After Median Filter')
        plt.show()
        '''
        image = skimage.filters.gaussian(image, sigma=0.01, output=None, mode='reflect', preserve_range=True)
        ####################################################################
        # Added to ensure negligible loss when converting to uint16
        # within exposure.equalize_adapthist
        # (scaling by 2**16 - 1, not 2**16, keeps the max in uint16 range)
        image = (image/np.max(image)*(2**16 - 1)).astype(np.uint16)
        # A "monkey patch" could possibly be used as a cleaner solution,
        # but would be more involved than is necessary for my application
        ####################################################################
        image = exposure.equalize_adapthist(image, kernel_size=image.shape[0]//8, clip_limit=0.005, nbins=2**13)
        image = image.astype(np.float64)
        '''
        plt.imshow(image, cmap='gray')
        plt.title('After Local Adapt Hist')
        plt.show()
        '''
        # Separable median filtering followed by a light Gaussian blur
        image = scipy.ndimage.median_filter(image, size=(3, 1))
        image = scipy.ndimage.median_filter(image, size=(1, 3))
        image = skimage.filters.gaussian(image, sigma=0.1, output=None, mode='reflect', preserve_range=True)
        image = exposure.rescale_intensity(image, in_range='image', out_range=(0.0, 1.0))
        '''
        plt.imshow(image, cmap='gray')
        plt.title('Final Image')
        plt.show()
        '''
        '''
        plt.hist(image)
        plt.show()
        '''
        clean_image = image.astype(np.float32)
    else:
        clean_image = image
    return clean_image
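# Example usage (a minimal sketch; the input path is hypothetical):
# cleaned = data_clean_func(np.load('converted_data/sample_index0.npy'))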
def data_cleaning(input_dir='converted_data', output_dir_name='cleaned_data',
                  output_file_format='.npy', delete_previous=True):
    '''
    Removes some of the noise from the data and makes the underlying
    vessel structure more prominent
    Input:  input_dir -> directory that holds the data to be cleaned
            output_dir_name -> directory to hold the cleaned data
    Output: None
    '''
    file_list = os.listdir(input_dir)
    clean_dir = os.path.join(os.getcwd(), output_dir_name)
    if not os.path.exists(clean_dir):
        os.mkdir(clean_dir)
    elif delete_previous:
        shutil.rmtree(clean_dir)
        os.mkdir(clean_dir)
    for file in file_list:
        filename = os.fsdecode(file)
        filepath = os.path.join(input_dir, filename)
        if filepath.endswith('.npy'):
            array = np.load(filepath)
        else:
            array = imageio.imread(filepath)
        # Data clean function defined above:
        array = data_preprocessing_utils.data_clean_func(array)
        # Swap the original suffix for the requested output format
        new_filepath = Path(os.path.join(clean_dir, filename)).with_suffix(output_file_format)
        if output_file_format == '.npy':
            np.save(new_filepath, array, allow_pickle=True, fix_imports=True)
        else:
            imageio.imwrite(new_filepath, array)
    return
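# Example usage (a minimal sketch, assuming convert_MAP was run with the default directories):
# data_cleaning(input_dir='converted_data', output_dir_name='cleaned_data')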
##################################################################################################################################
# Data Separation / Validation Split Procedures:
def data_seperation(input_dir, dataset_percentages,
                    delete_previous=False, file_format='.npy',
                    scale=1):
    '''
    Takes a folder of numpy arrays and creates a data folder with separate
    sections for training, validation, and testing according to the given
    percentages
    Input:  input_dir -> file path to the data folder of numpy files
            dataset_percentages -> (% train, % test) such that % train + % test = 100
            OR
            dataset_percentages -> (% train, % val, % test) such that % train + % val + % test = 100
    Output: new folders for training and testing or training/validation/testing
    '''
    new_dir = os.path.join(os.getcwd(), 'data')  # main data folder (created on demand)

    def make_split_dir(name):
        # Builds data/<name>/input, clearing any previous split if requested;
        # os.makedirs also covers the case where data/<name> already exists
        # but data/<name>/input does not
        split_dir = os.path.join(new_dir, name)
        if delete_previous and os.path.exists(split_dir):
            shutil.rmtree(split_dir)
        split_input_dir = os.path.join(split_dir, 'input')
        os.makedirs(split_input_dir, exist_ok=True)
        return split_input_dir

    def load_array(filepath):
        # Loads a file and rescales it to [0, scale]
        if filepath.endswith('.npy'):
            array = np.load(filepath)
        else:
            array = imageio.imread(filepath)
        return array/np.max(array)*scale

    def save_array(array, filepath):
        # Saves in the requested file format, swapping the original suffix
        filepath = Path(filepath).with_suffix(file_format)
        if file_format == '.npy':
            np.save(filepath, array, allow_pickle=True, fix_imports=True)
        else:
            imageio.imwrite(filepath, array)

    # If just train and test
    if len(dataset_percentages) == 2:
        train_dir = make_split_dir('train')
        test_dir = make_split_dir('test')
        file_list = os.listdir(input_dir)
        total_num_imgs = len(file_list)
        train_percent, test_percent = dataset_percentages
        valid_inputs = (train_percent >= test_percent and train_percent <= 100 and
                        test_percent <= 100 and train_percent > 0 and test_percent > 0 and
                        train_percent + test_percent == 100)
        if valid_inputs:
            num_train = total_num_imgs*train_percent//100
        else:
            num_train = int(round(total_num_imgs*0.9))
            print('ERROR: Please input valid percentages for dataset division')
            print('In place of valid input the ratio 90% train, 10% test was used')
        random.shuffle(file_list)
        for index, file in enumerate(file_list):
            filename = os.fsdecode(file)
            array = load_array(os.path.join(input_dir, filename))
            if index < num_train:
                save_array(array, os.path.join(train_dir, filename))
            else:
                save_array(array, os.path.join(test_dir, filename))
        return train_dir, test_dir
    # If train, val, and test
    elif len(dataset_percentages) == 3:
        train_dir = make_split_dir('train')
        val_dir = make_split_dir('val')
        test_dir = make_split_dir('test')
        file_list = os.listdir(input_dir)
        total_num_imgs = len(file_list)
        train_percent, val_percent, test_percent = dataset_percentages
        valid_inputs = (train_percent >= test_percent and train_percent >= val_percent
                        and train_percent <= 100 and val_percent <= 100 and test_percent <= 100
                        and train_percent > 0 and val_percent > 0 and test_percent > 0 and
                        train_percent + val_percent + test_percent == 100)
        if valid_inputs:
            num_train = total_num_imgs*train_percent//100
            num_val = total_num_imgs*val_percent//100
        else:
            num_train = int(round(total_num_imgs*0.9))
            num_val = int(round((total_num_imgs - num_train)/2))
            print('ERROR: Please input valid percentages for dataset division')
            print('In place of a valid input the ratio 90% train, 5% val, 5% test was used')
        random.shuffle(file_list)
        for index, file in enumerate(file_list):
            filename = os.fsdecode(file)
            array = load_array(os.path.join(input_dir, filename))
            if index < num_train:
                save_array(array, os.path.join(train_dir, filename))
            elif index < num_train + num_val:  # '<' avoids placing one extra file in val
                save_array(array, os.path.join(val_dir, filename))
            else:
                save_array(array, os.path.join(test_dir, filename))
        return train_dir, val_dir, test_dir
    else:
        print('ERROR: Please divide into train/test or train/val/test')
        return None
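# Example usage (a minimal sketch; the split percentages are hypothetical):
# train_dir, val_dir, test_dir = data_seperation('cleaned_data', (80, 10, 10), delete_previous=True)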