SVNH_DatasetUtil.py

import numpy as np
import scipy.io as sio
import os.path

train_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "SVHN_train_32x32.mat")
test_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "SVHN_test_32x32.mat")
extra_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "SVHN_extra_32x32.mat")

# Default parameters for argparse
# default_params = {
#     "learning_rate": 0.001,
#     "num_epochs": 25,
#     "batch_size": 128,
#     "train_data_file": "./assignment_httm_data/SVHN_train_32x32.mat",
#     "test_data_file": "./assignment_httm_data/SVHN_test_32x32.mat",
#     "extra_data_file": "./assignment_httm_data/SVHN_extra_32x32.mat",
#     "load_extra": False,
#     "model": "CNN1",
#     "validation_percentage": 0.1,
#     "data_shuffle": True,
#     "preprocess": False,
#     "mode": 'train',
#     "runs_name": None,
#     "tensorboard_dir": '~/tensorboard_runs'
# }
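
# A minimal sketch (an assumption, not part of this module) of how a few of
# these defaults could be wired into argparse:
#
#   import argparse
#   parser = argparse.ArgumentParser()
#   parser.add_argument("--learning_rate", type=float, default=0.001)
#   parser.add_argument("--num_epochs", type=int, default=25)
#   parser.add_argument("--batch_size", type=int, default=128)
#   args = parser.parse_args()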


def load_raw_data(train_data_file, test_data_file, load_extra_data, extra_data_file):
    """
    Load the raw Google SVHN 32x32 cropped-digit data from .mat files.
    """
    loading_information = "with Extra" if load_extra_data else "without Extra"
    print("Loading SVHN dataset {}...".format(loading_information))
    raw_train_data = sio.loadmat(train_data_file)
    raw_test_data = sio.loadmat(test_data_file)
    if load_extra_data:
        raw_extra_data = sio.loadmat(extra_data_file)
        print("Train size: {}, Test size: {}, Extra size: {}".format(raw_train_data['X'].shape[3],
                                                                     raw_test_data['X'].shape[3],
                                                                     raw_extra_data['X'].shape[3]))
        return [raw_train_data, raw_test_data, raw_extra_data]
    else:
        print("Train size: {}, Test size: {}".format(raw_train_data['X'].shape[3],
                                                     raw_test_data['X'].shape[3]))
        return [raw_train_data, raw_test_data]
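
# Note: in these .mat files, 'X' has shape (32, 32, 3, N) and 'y' has shape
# (N, 1); labels run from 1 to 10, with 10 standing in for the digit 0.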


def format_data(raw_data, number_of_examples):
    """
    Reshape raw data from (height, width, channels, N) to
    (N, height, width, channels) and scale pixel values to [0, 1].
    """
    new_data = []
    for i in range(number_of_examples):
        # Move the example index from the last axis to the first
        new_data.append(raw_data[:, :, :, i])
    new_data = np.asarray(new_data)
    print("Data has been reshaped from {} to {}".format(raw_data.shape, new_data.shape))
    return new_data / 255.
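
# An equivalent vectorized form (assuming the same (H, W, C, N) input layout)
# that avoids the Python loop:
#
#   def format_data_vectorized(raw_data):
#       return np.transpose(raw_data, (3, 0, 1, 2)) / 255.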


def one_hot_encoder(data, number_of_labels):
    """
    One-hot encoder for labels. SVHN stores the digit 0 as label 10,
    so label 10 is mapped back to index 0.
    """
    data_size = len(data)
    one_hot_matrix = np.zeros(shape=(data_size, number_of_labels))
    for i in range(data_size):
        current_row = np.zeros(shape=(number_of_labels,))
        current_number = data[i][0]
        if current_number == 10:
            current_row[0] = 1
        else:
            current_row[current_number] = 1
        one_hot_matrix[i] = current_row
    return one_hot_matrix
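
# Example of the mapping (hypothetical labels): one_hot_encoder(np.array([[10], [3]]), 10)
# returns two rows with a 1 at index 0 (the digit 0, stored as 10) and at index 3.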


def load_svhn_data(train_path, test_path, extra_path, load_extra, eval_percentage):
    """
    Load the SVHN dataset and split it into training, evaluation, and test sets.
    """
    print("Loading SVHN dataset for classification...")
    # Load raw dataset
    if load_extra:
        print("Found extra dataset, loading it...")
        train, test, extra = load_raw_data(train_path, test_path, load_extra, extra_path)
        train['X'] = np.concatenate((train['X'], extra['X']), axis=3)
        train['y'] = np.concatenate((train['y'], extra['y']), axis=0)
    else:
        train, test = load_raw_data(train_path, test_path, load_extra, extra_path)
    # Get values and labels
    train_all_values = format_data(train['X'], train['X'].shape[3])
    train_all_labels = one_hot_encoder(train['y'], 10)
    test_values = format_data(test['X'], test['X'].shape[3])
    test_labels = one_hot_encoder(test['y'], 10)
    np.random.seed(41)  # fixed seed so the shuffle (and hence the split) is reproducible
    shuffle_indices = np.random.permutation(np.arange(len(train_all_values)))
    train_values_shuffled = train_all_values[shuffle_indices]
    train_labels_shuffled = train_all_labels[shuffle_indices]
    # Separate into training and eval sets
    # The original setting split the data into training and validation samples
    train_index = -1 * int(eval_percentage * float(len(train_values_shuffled)))
    train_values, eval_values = train_values_shuffled[:train_index], train_values_shuffled[train_index:]
    train_labels, eval_labels = train_labels_shuffled[:train_index], train_labels_shuffled[train_index:]
    print("Train/Eval split: {:d}/{:d}".format(len(train_labels), len(eval_labels)))
    print("Loading data completed")
    return [train_values, train_labels, eval_values, eval_labels, test_values, test_labels]
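
# Usage sketch (shapes assume the standard 32x32 SVHN files):
#   train_X, train_Y, eval_X, eval_Y, test_X, test_Y = load_svhn_data(
#       train_path, test_path, extra_path, load_extra=False, eval_percentage=0.1)
#   train_X: (N_train, 32, 32, 3), train_Y: (N_train, 10)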


def my_load_svhn_data(train_path, test_path, extra_path, load_extra):
    """
    Load the SVHN dataset without the train/eval split: the full shuffled
    training set is returned alongside the test set.
    """
    print("Loading SVHN dataset for classification...")
    # Load raw dataset
    if load_extra:
        print("Found extra dataset, loading it...")
        train, test, extra = load_raw_data(train_path, test_path, load_extra, extra_path)
        train['X'] = np.concatenate((train['X'], extra['X']), axis=3)
        train['y'] = np.concatenate((train['y'], extra['y']), axis=0)
    else:
        train, test = load_raw_data(train_path, test_path, load_extra, extra_path)
    # Get values and labels
    train_all_values = format_data(train['X'], train['X'].shape[3])
    train_all_labels = one_hot_encoder(train['y'], 10)
    test_values = format_data(test['X'], test['X'].shape[3])
    test_labels = one_hot_encoder(test['y'], 10)
    np.random.seed(41)  # fixed seed so the shuffle is reproducible
    shuffle_indices = np.random.permutation(np.arange(len(train_all_values)))
    train_values_shuffled = train_all_values[shuffle_indices]
    train_labels_shuffled = train_all_labels[shuffle_indices]
    print("Loading data completed")
    return train_values_shuffled, train_labels_shuffled, test_values, test_labels
    # Separate into training and eval sets
    # # The original setting split the data into training and validation samples
    # train_index = -1 * int(eval_percentage * float(len(train_values_shuffled)))
    # train_values, eval_values = train_values_shuffled[:train_index], train_values_shuffled[train_index:]
    # train_labels, eval_labels = train_labels_shuffled[:train_index], train_labels_shuffled[train_index:]
    # print("Train/Eval split: {:d}/{:d}".format(len(train_labels), len(eval_labels)))
    # print("Loading data completed")
    # return [train_values, train_labels, eval_values, eval_labels, test_values, test_labels]


def load_data():
    train_X, train_Y, test_X, test_Y = my_load_svhn_data(train_path=train_path,
                                                         test_path=test_path,
                                                         extra_path=extra_path,
                                                         load_extra=False)
    return (train_X, train_Y), (test_X, test_Y)


if __name__ == "__main__":
    # train_X, train_Y, eval_X, eval_Y, test_X, test_Y = load_svhn_data(train_path=train_path,
    #                                                                   test_path=test_path,
    #                                                                   extra_path=extra_path,
    #                                                                   load_extra=True,
    #                                                                   eval_percentage=0.1)
    (train_X, train_Y), (test_X, test_Y) = load_data()
    print(np.shape(train_X))
    print(np.shape(train_Y))
    print(np.shape(test_X))
    print(np.shape(test_Y))