# Practica2.py
  1. # -*- coding: utf-8 -*-
  2. """
  3. Autor:
  4. Francisco Solano López Rodríguez
  5. Fecha:
  6. Noviembre/2018
  7. Contenido:
  8. Practica 2 Clustering
  9. Inteligencia de Negocio
  10. Grado en Ingeniería Informática
  11. Universidad de Granada
  12. """
  13. import time
  14. import matplotlib.pyplot as plt
  15. import pandas as pd
  16. import numpy as np
  17. from sklearn.cluster import KMeans
  18. from sklearn.cluster import KMeans, AgglomerativeClustering,estimate_bandwidth
  19. from sklearn.cluster import Birch,SpectralClustering,MeanShift,DBSCAN, MiniBatchKMeans
  20. from sklearn import metrics
  21. from sklearn import preprocessing
  22. from math import floor
  23. import seaborn as sns
  24. from scipy.cluster.hierarchy import dendrogram,ward
  25. seed = 12345
  26. ################### FUNCIONES ###########################
  27. def getPrediction(algorithm, X):
  28. t = time.time()
  29. cluster_predict = algorithm.fit_predict(X)
  30. tiempo = time.time() - t
  31. return cluster_predict, tiempo
  32. # Función para obtener las medias de cada cluster
  33. def getMeans(dataFrame):
  34. return dataFrame.groupby("cluster").mean()
  35. # Función para obtener las desviaciones de cada cluster
  36. def getStd(dataFrame):
  37. return dataFrame.groupby("cluster").std()
  38. # Función para pintar Scatter Matrix
  39. def DrawScatterMatrix(data, name=None, display=True, save=False):
  40. sns.set()
  41. variables = list(data)
  42. variables.remove('cluster')
  43. sns_plot = sns.pairplot(data, vars=variables, hue="cluster", palette='Paired', plot_kws={"s": 25},
  44. diag_kind="hist")
  45. sns_plot.fig.subplots_adjust(wspace=.03, hspace=.03)
  46. if name != None:
  47. plt.title("scatter_"+name)
  48. # Mostrar imagen por pantalla
  49. if display:
  50. plt.show()
  51. # Guardar imagen en memoria
  52. if save:
  53. if name == None:
  54. name = "_unknown_"
  55. image_name = "scatter/scatter_" + name + ".png"
  56. plt.savefig(image_name)
  57. plt.clf()
  58. print("Imagen guardada: ", image_name)
  59. # Función para pintar heatmap
  60. def DrawHeatmap(data, name = None, display=True, save = False):
  61. data_normal = data.apply(norm_to_zero_one)
  62. meanDF = getMeans(dataFrame = data_normal)
  63. hm = sns.heatmap(data=meanDF, linewidths=.1, cmap="Blues", annot=True, xticklabels='auto')
  64. plt.xticks(rotation=0)
  65. plt.title("heatmap_"+name)
  66. if name != None:
  67. plt.title("heatmap_"+name)
  68. # Mostrar imagen por pantalla
  69. if display:
  70. plt.show()
  71. # Guardar imagen en memoria
  72. if save:
  73. if name == None:
  74. name = "_unknown_"
  75. image_name = "heatmap/heatmap_" + name + ".png"
  76. plt.savefig(image_name)
  77. plt.clf()
  78. print("Imagen guardada: ", image_name)
  79. # Función para pintar dendograma
  80. def DrawDendrogram(data, name = None, display=True, save = False):
  81. data_normal = preprocessing.normalize(data,norm='l2')
  82. linkage_array = ward(X_normal)
  83. dendrogram(linkage_array,leaf_rotation=90., leaf_font_size=5.)
  84. if name != None:
  85. plt.title("dendograma_" + name)
  86. # Mostrar imagen por pantalla
  87. if display:
  88. plt.show()
  89. # Guardar imagen en memoria
  90. if save:
  91. if name == None:
  92. name = "_unknown_"
  93. image_name = "dendrogram/dendrogram_" + name + ".png"
  94. plt.savefig(image_name)
  95. plt.clf()
  96. print("Imagen guardada: ", image_name)
  97. def dataFrameResultados(algoritmos, num_cluster, metrics_CH, metrics_SC, tiempos):
  98. df_algo = pd.DataFrame(algoritmos, columns=['Algoritmo'])
  99. df_nc = pd.DataFrame(num_cluster, columns=['Num. Clusters'])
  100. df_CH = pd.DataFrame(metrics_CH, columns=['CH'])
  101. df_SC = pd.DataFrame(metrics_SC, columns=['SH'])
  102. df_t = pd.DataFrame(tiempos, columns=['Tiempo'])
  103. resultados = pd.concat([df_algo, df_nc, df_CH, df_SC, df_t], axis=1)
  104. return resultados
  105. def norm_to_zero_one(df):
  106. return (df - df.min()) * 1.0 / (df.max() - df.min())
  107. def executeClustering(algorithms, X, caso):
  108. f = open("caso_" + str(caso) + ".txt", 'w')
  109. X_normal = X.apply(norm_to_zero_one)
  110. names = []
  111. num_cluster = []
  112. metrics_CH = []
  113. metrics_SC = []
  114. tiempos = []
  115. print("\nCaso de estudio ", caso, ", tamaño: ", len(X))
  116. f.write("\nCaso de estudio " + str(caso) + ", tamaño: " + str(len(X)))
  117. for algorithm, name_algorithm in algorithms:
  118. print("\n----------------------------------------\n")
  119. print("Ejecutando algoritmo: ", name_algorithm, "\n")
  120. f.write("\n--------------------------------------\n")
  121. f.write("Ejecutando algoritmo: " + name_algorithm + "\n")
  122. # Ejecución algoritmo clustering
  123. cluster_predict, tiempo = getPrediction(algorithm, X_normal)
  124. # Pasar las predicciones a dataFrame
  125. clusters = pd.DataFrame(cluster_predict,index=X.index,columns=['cluster'])
  126. print("Tamaño de cada cluster:")
  127. f.write("\nTamaño de cada cluster:\n")
  128. size=clusters['cluster'].value_counts()
  129. for num,i in size.iteritems():
  130. print('%s: %5d (%5.2f%%)' % (num,i,100*i/len(clusters)))
  131. f.write('%s: %5d (%5.2f%%)\n' % (num,i,100*i/len(clusters)))
  132. print()
  133. # Obtener los resultados de las métricas
  134. metric_CH = metrics.calinski_harabaz_score(X_normal, cluster_predict)
  135. metric_SC = metrics.silhouette_score(X_normal, cluster_predict, metric='euclidean',
  136. sample_size=floor(0.2*len(X)), random_state=seed)
  137. # Guardamos el nombre del algoritmo, número de cluster,
  138. # los tiempos y las métricas para la posterior comparacion
  139. names.append(name_algorithm)
  140. num_cluster.append(len(set(cluster_predict)))
  141. metrics_CH.append(metric_CH)
  142. metrics_SC.append(metric_SC)
  143. tiempos.append(tiempo)
  144. # Se añade la asignación de clusters como columna a X
  145. X_cluster = pd.concat([X, clusters], axis=1)
  146. X_normal_cluster = pd.concat([X_normal, clusters], axis=1)
  147. name = "caso_" + str(caso) + "_" + name_algorithm
  148. # Pintamos el scatter matrix
  149. DrawScatterMatrix(data = X_cluster, name = name, display = False, save = True)
  150. # Pintamos el heatmap
  151. DrawHeatmap(data = X_cluster, name = name, display = False, save = True)
  152. # DataFrame con la media de cada característica en cada cluster
  153. meanDF = getMeans(dataFrame = X_cluster)
  154. print()
  155. print(meanDF)
  156. f.write(meanDF.to_string())
  157. # Si el algoritmo es AgglomerativeClustering pintamos el dendograma
  158. if name_algorithm == 'AC':
  159. DrawDendrogram(data = X_cluster, name = name, display = False, save = True)
  160. resultados = dataFrameResultados(names, num_cluster, metrics_CH, metrics_SC, tiempos)
  161. print("\n**************************************\n")
  162. print(resultados.to_string())
  163. print("\n**************************************\n")
  164. f.write("\n**************************************\n")
  165. f.write(resultados.to_string())
  166. f.write("\n**************************************\n")
  167. f.close()
#########################################################
# Read the dataset
print("Leyendo el conjunto de datos...")
censo = pd.read_csv('censo_granada.csv')
# Replace missing values with 0
# NOTE(review): the np.NaN alias was removed in NumPy 2.0; np.nan is the
# portable spelling — confirm the installed NumPy version.
censo = censo.replace(np.NaN,0)
print("Lectura completada.")
###### CASE STUDIES ######
#-------- CASE 1 --------
# Census codebook values (presumably: SEXO 1 = male, 6 = female,
# marital status 2 = married — verify against the dataset codebook).
casado = 2   # unused below
hombre = 1
mujer = 6
# Women aged 20 to 50
subset = censo.loc[(censo['EDAD']>=20) & (censo['EDAD']<=50) & (censo['SEXO']==mujer)]
# Selected features: age, family size, children under 5, children 5-15
usadas = ['EDAD', 'NPFAM', 'HM5', 'H0515']
X = subset[usadas]
# NOTE(review): this l2-normalized copy is never used later —
# executeClustering re-normalizes with norm_to_zero_one.
X_normal = preprocessing.normalize(X, norm='l2')
#-------- CASE 2 --------
# Men aged 20 to 50, same features as case 1
subset_2 = censo.loc[(censo['EDAD']>=20) & (censo['EDAD']<=50) & (censo['SEXO']==hombre)]
usadas_2 = ['EDAD', 'NPFAM', 'HM5', 'H0515']
X_2 = subset_2[usadas_2]
X_normal_2 = X_2.apply(norm_to_zero_one)
#-------- CASE 3 --------
# Women aged 20 to 50 with a different feature set
subset_3 = censo.loc[(censo['EDAD']>=20) & (censo['EDAD']<=50) & (censo['SEXO']==mujer)]
usadas_3 = ['EDAD', 'NPFAM', 'NHIJOS', 'ESREAL']
X_3 = subset_3[usadas_3]
X_normal_3 = X_3.apply(norm_to_zero_one)
###############################
# Correlation between the variables (kept commented out, as in the original)
'''
correlation = X.corr()
sns.heatmap(correlation, square = True)
plt.show()
'''
#################### Algorithms #####################
random_seed = 123
k_means = KMeans(init='k-means++', n_clusters=5, n_init=5, random_state=random_seed)
agglo=AgglomerativeClustering(n_clusters=5,linkage="ward")
meanshift = MeanShift(bin_seeding=True)
miniBatchKMeans = MiniBatchKMeans(init='k-means++',n_clusters=4, n_init=5, max_no_improvement=10, verbose=0, random_state=random_seed)
dbscan = DBSCAN(eps=0.2)
dbscan2 = DBSCAN(eps=0.1)
# Algorithm batteries: the second one only swaps the DBSCAN eps (0.2 -> 0.1)
algorithms = [(k_means, "KMeans"),
              (agglo, "AC"),
              (meanshift, "MeanShift"),
              (miniBatchKMeans, "MiniBatchKM"),
              (dbscan, "DBSCAN")]
algorithms2 = [(k_means, "KMeans"),
               (agglo, "AC"),
               (meanshift, "MeanShift"),
               (miniBatchKMeans, "MiniBatchKM"),
               (dbscan2, "DBSCAN2")]
# KMeans with different numbers of clusters (5 to 8)
algorithm_kmeans = []
for i in range(5,9):
    kmeans_i = KMeans(init='k-means++', n_clusters=i, n_init=5)
    algorithm_kmeans.append((kmeans_i, "KMeans_" + str(i)))
# AgglomerativeClustering with different numbers of clusters (5 to 8)
algorithm_AC = []
for i in range(5,9):
    agglo_i = AgglomerativeClustering(n_clusters=i,linkage="ward")
    algorithm_AC.append((agglo_i, "AC_" + str(i)))
# MiniBatchKMeans with different numbers of clusters (5 to 8)
algorithm_miniBatch = []
for i in range(5,9):
    miniBatch_i = MiniBatchKMeans(init='k-means++',n_clusters=i, n_init=5, max_no_improvement=10, verbose=0, random_state=random_seed)
    algorithm_miniBatch.append((miniBatch_i, "MiniBatchKM_" + str(i)))
#-----------------------------------------------------#
# RUN CASE 1
executeClustering(algorithms, X, 1)
executeClustering(algorithm_kmeans, X, 1.1)
executeClustering(algorithm_AC, X, 1.2)
# RUN CASE 2
executeClustering(algorithms, X_2, 2)
executeClustering(algorithm_kmeans, X_2, 2.1)
executeClustering(algorithm_miniBatch, X_2, 2.2)
# RUN CASE 3
executeClustering(algorithms2, X_3, 3)
executeClustering(algorithm_kmeans, X_3, 3.1)
executeClustering(algorithm_miniBatch, X_3, 3.2)