"""
Autor:
    Francisco Solano López Rodríguez
Fecha:
    Noviembre/2018
Contenido:
    Práctica 2 Clustering
    Inteligencia de Negocio
    Grado en Ingeniería Informática
    Universidad de Granada
"""
- import time
- import matplotlib.pyplot as plt
- import pandas as pd
- import numpy as np
- from sklearn.cluster import KMeans
- from sklearn.cluster import KMeans, AgglomerativeClustering,estimate_bandwidth
- from sklearn.cluster import Birch,SpectralClustering,MeanShift,DBSCAN, MiniBatchKMeans
- from sklearn import metrics
- from sklearn import preprocessing
- from math import floor
- import seaborn as sns
- from scipy.cluster.hierarchy import dendrogram,ward
# Fixed seed; executeClustering passes it as random_state to the sampled
# silhouette score so the reported value is repeatable between runs.
seed = 12345
def getPrediction(algorithm, X):
    """Run ``algorithm.fit_predict(X)`` and measure how long the call takes.

    Args:
        algorithm: Any object exposing a ``fit_predict(X)`` method.
        X: Data handed unchanged to ``fit_predict``.

    Returns:
        A ``(cluster_predict, tiempo)`` tuple: the value returned by
        ``fit_predict`` and the elapsed time in seconds.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or go negative) if the system clock is adjusted during the fit.
    start = time.perf_counter()
    cluster_predict = algorithm.fit_predict(X)
    tiempo = time.perf_counter() - start
    return cluster_predict, tiempo
def getMeans(dataFrame):
    """Return the per-cluster mean of every column.

    Rows are grouped by the ``"cluster"`` column; the result is indexed by
    the distinct cluster values.
    """
    by_cluster = dataFrame.groupby("cluster")
    return by_cluster.mean()
def getStd(dataFrame):
    """Return the per-cluster standard deviation of every column.

    Rows are grouped by the ``"cluster"`` column; the result is indexed by
    the distinct cluster values.
    """
    by_cluster = dataFrame.groupby("cluster")
    return by_cluster.std()
def DrawScatterMatrix(data, name=None, display=True, save=False):
    """Draw a seaborn pair plot of ``data`` coloured by its ``cluster`` column.

    Args:
        data: DataFrame containing a ``'cluster'`` column; every other column
            is plotted against every other.
        name: Optional label used in the title and in the output file name.
        display: When true, show the figure with ``plt.show()``.
        save: When true, write the figure to ``scatter/scatter_<name>.png``
            (``_unknown_`` is used when ``name`` is ``None``) and then clear
            the current figure.
    """
    sns.set()
    variables = list(data)
    variables.remove('cluster')
    sns_plot = sns.pairplot(data, vars=variables, hue="cluster", palette='Paired',
                            plot_kws={"s": 25}, diag_kind="hist")
    sns_plot.fig.subplots_adjust(wspace=.03, hspace=.03)

    if name is not None:
        plt.title("scatter_" + name)

    # Save BEFORE showing: with an interactive backend plt.show() releases
    # the figure, so a savefig() issued afterwards writes a blank image.
    image_name = None
    if save:
        file_stem = "_unknown_" if name is None else name
        image_name = "scatter/scatter_" + file_stem + ".png"
        plt.savefig(image_name)

    if display:
        plt.show()

    if save:
        plt.clf()
        print("Imagen guardada: ", image_name)
def DrawHeatmap(data, name=None, display=True, save=False):
    """Draw a heatmap of the per-cluster means of the min-max scaled features.

    Args:
        data: DataFrame containing a ``'cluster'`` column plus the feature
            columns to summarise.
        name: Optional label used in the title and in the output file name.
        display: When true, show the figure with ``plt.show()``.
        save: When true, write the figure to ``heatmap/heatmap_<name>.png``
            (``_unknown_`` is used when ``name`` is ``None``) and then clear
            the current figure.
    """
    # Scale only the feature columns. Scaling the label column as well would
    # distort the group labels and, when a single cluster is present, turn
    # them into NaN (0/0) so the group-by would silently drop every row.
    features = data.drop('cluster', axis=1)
    data_normal = features.apply(norm_to_zero_one)
    data_normal['cluster'] = data['cluster']

    meanDF = getMeans(dataFrame=data_normal)
    sns.heatmap(data=meanDF, linewidths=.1, cmap="Blues", annot=True, xticklabels='auto')
    plt.xticks(rotation=0)

    # The title is set only when a name was given; concatenating None with a
    # string would raise TypeError.
    if name is not None:
        plt.title("heatmap_" + name)

    # Save BEFORE showing: with an interactive backend plt.show() releases
    # the figure, so a savefig() issued afterwards writes a blank image.
    image_name = None
    if save:
        file_stem = "_unknown_" if name is None else name
        image_name = "heatmap/heatmap_" + file_stem + ".png"
        plt.savefig(image_name)

    if display:
        plt.show()

    if save:
        plt.clf()
        print("Imagen guardada: ", image_name)
def DrawDendrogram(data, name=None, display=True, save=False):
    """Draw a Ward-linkage dendrogram of the L2-normalised rows of ``data``.

    Every column of ``data`` takes part in the linkage, so callers should
    pass only the columns they want clustered.

    Args:
        data: 2-D numeric data accepted by ``preprocessing.normalize``.
        name: Optional label used in the title and in the output file name.
        display: When true, show the figure with ``plt.show()``.
        save: When true, write the figure to
            ``dendrogram/dendrogram_<name>.png`` (``_unknown_`` is used when
            ``name`` is ``None``) and then clear the current figure.
    """
    data_normal = preprocessing.normalize(data, norm='l2')
    # Build the linkage from the data passed in; the previous code read an
    # unrelated module-level variable and ignored the argument.
    linkage_array = ward(data_normal)
    dendrogram(linkage_array, leaf_rotation=90., leaf_font_size=5.)

    if name is not None:
        plt.title("dendograma_" + name)

    # Save BEFORE showing: with an interactive backend plt.show() releases
    # the figure, so a savefig() issued afterwards writes a blank image.
    image_name = None
    if save:
        file_stem = "_unknown_" if name is None else name
        image_name = "dendrogram/dendrogram_" + file_stem + ".png"
        plt.savefig(image_name)

    if display:
        plt.show()

    if save:
        plt.clf()
        print("Imagen guardada: ", image_name)
def dataFrameResultados(algoritmos, num_cluster, metrics_CH, metrics_SC, tiempos):
    """Assemble the per-algorithm summary table.

    Each argument is a sequence with one entry per executed algorithm. The
    sequences become, in order, the columns ``Algoritmo``, ``Num. Clusters``,
    ``CH``, ``SH`` and ``Tiempo`` of the returned DataFrame.
    """
    labelled_columns = (
        ('Algoritmo', algoritmos),
        ('Num. Clusters', num_cluster),
        ('CH', metrics_CH),
        ('SH', metrics_SC),
        ('Tiempo', tiempos),
    )
    # One single-column frame per series, glued side by side.
    frames = [pd.DataFrame(values, columns=[label]) for label, values in labelled_columns]
    return pd.concat(frames, axis=1)
def norm_to_zero_one(df):
    """Min-max scale ``df`` so its smallest value maps to 0 and its largest to 1.

    No special handling is done when the maximum equals the minimum; the
    division by a zero range is left to the underlying arithmetic.
    """
    lowest = df.min()
    value_range = df.max() - lowest
    shifted = df - lowest
    return shifted * 1.0 / value_range
def executeClustering(algorithms, X, caso):
    """Run every clustering algorithm on ``X`` and report the results.

    For each ``(estimator, name)`` pair the data is clustered on a min-max
    scaled copy of ``X``. Cluster sizes, the Calinski-Harabasz score and a
    sampled silhouette score are computed, then a scatter matrix and a heatmap
    are saved. A dendrogram is also saved when the name is exactly ``'AC'``.
    Everything printed to the console is mirrored (with slightly different
    separators) into ``caso_<caso>.txt``.

    Args:
        algorithms: Iterable of ``(estimator, name)`` pairs; each estimator
            must expose ``fit_predict``.
        X: pandas DataFrame with the feature columns to cluster.
        caso: Case-study identifier used in the messages and file names.
    """
    # scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score
    # and later removed the old spelling; prefer the new name and fall back
    # to the old one so both old and new releases work.
    calinski_harabasz = getattr(metrics, "calinski_harabasz_score", None)
    if calinski_harabasz is None:
        calinski_harabasz = metrics.calinski_harabaz_score

    names = []
    num_cluster = []
    metrics_CH = []
    metrics_SC = []
    tiempos = []

    # The context manager guarantees the report is flushed and closed even if
    # an estimator, a metric or a plot raises part-way through.
    with open("caso_" + str(caso) + ".txt", 'w') as f:
        X_normal = X.apply(norm_to_zero_one)

        print("\nCaso de estudio ", caso, ", tamaño: ", len(X))
        f.write("\nCaso de estudio " + str(caso) + ", tamaño: " + str(len(X)))

        for algorithm, name_algorithm in algorithms:
            print("\n----------------------------------------\n")
            print("Ejecutando algoritmo: ", name_algorithm, "\n")
            f.write("\n--------------------------------------\n")
            f.write("Ejecutando algoritmo: " + name_algorithm + "\n")

            cluster_predict, tiempo = getPrediction(algorithm, X_normal)

            # Labels as a one-column frame aligned with X's index.
            clusters = pd.DataFrame(cluster_predict, index=X.index, columns=['cluster'])

            print("Tamaño de cada cluster:")
            f.write("\nTamaño de cada cluster:\n")
            size = clusters['cluster'].value_counts()
            # Series.items() replaces iteritems(), which pandas 2.0 removed.
            for num, i in size.items():
                print('%s: %5d (%5.2f%%)' % (num, i, 100*i/len(clusters)))
                f.write('%s: %5d (%5.2f%%)\n' % (num, i, 100*i/len(clusters)))
            print()

            metric_CH = calinski_harabasz(X_normal, cluster_predict)
            # Silhouette is estimated on a 20% sample, seeded for repeatability.
            metric_SC = metrics.silhouette_score(X_normal, cluster_predict, metric='euclidean',
                                                 sample_size=floor(0.2*len(X)), random_state=seed)

            names.append(name_algorithm)
            num_cluster.append(len(set(cluster_predict)))
            metrics_CH.append(metric_CH)
            metrics_SC.append(metric_SC)
            tiempos.append(tiempo)

            # Original (unscaled) features plus the assigned label.
            X_cluster = pd.concat([X, clusters], axis=1)
            name = "caso_" + str(caso) + "_" + name_algorithm

            DrawScatterMatrix(data = X_cluster, name = name, display = False, save = True)

            DrawHeatmap(data = X_cluster, name = name, display = False, save = True)

            meanDF = getMeans(dataFrame = X_cluster)
            print()
            print(meanDF)
            f.write(meanDF.to_string())

            if name_algorithm == 'AC':
                DrawDendrogram(data = X_cluster, name = name, display = False, save = True)

        resultados = dataFrameResultados(names, num_cluster, metrics_CH, metrics_SC, tiempos)

        print("\n**************************************\n")
        print(resultados.to_string())
        print("\n**************************************\n")
        f.write("\n**************************************\n")
        f.write(resultados.to_string())
        f.write("\n**************************************\n")
# ---------------------------------------------------------------------------
# Script body: load the census data, build the three case-study subsets,
# configure the estimators and run every experiment.
# ---------------------------------------------------------------------------
print("Leyendo el conjunto de datos...")
censo = pd.read_csv('censo_granada.csv')
# Missing values become 0. np.nan is used because the np.NaN alias was
# removed in NumPy 2.0; both names denote the same value on older releases.
censo = censo.replace(np.nan, 0)
print("Lectura completada.")

# Coded values; hombre/mujer are compared against the SEXO column below.
# NOTE(review): casado is never used in this script -- presumably a marital
# status code kept for reference; confirm before removing.
casado = 2
hombre = 1
mujer = 6

# Case 1: rows with 20 <= EDAD <= 50 and SEXO == mujer.
subset = censo.loc[(censo['EDAD']>=20) & (censo['EDAD']<=50) & (censo['SEXO']==mujer)]
usadas = ['EDAD', 'NPFAM', 'HM5', 'H0515']
X = subset[usadas]
# Kept as a module-level name on purpose: other code in this file may read it.
X_normal = preprocessing.normalize(X, norm='l2')

# Case 2: same age range and variables, SEXO == hombre.
subset_2 = censo.loc[(censo['EDAD']>=20) & (censo['EDAD']<=50) & (censo['SEXO']==hombre)]
usadas_2 = ['EDAD', 'NPFAM', 'HM5', 'H0515']
X_2 = subset_2[usadas_2]
X_normal_2 = X_2.apply(norm_to_zero_one)

# Case 3: same rows as case 1, different variables.
subset_3 = censo.loc[(censo['EDAD']>=20) & (censo['EDAD']<=50) & (censo['SEXO']==mujer)]
usadas_3 = ['EDAD', 'NPFAM', 'NHIJOS', 'ESREAL']
X_3 = subset_3[usadas_3]
X_normal_3 = X_3.apply(norm_to_zero_one)

# Disabled exploratory snippet (correlation heatmap of the case-1 variables):
# correlation = X.corr()
# sns.heatmap(correlation, square = True)
# plt.show()

random_seed = 123

k_means = KMeans(init='k-means++', n_clusters=5, n_init=5, random_state=random_seed)
agglo = AgglomerativeClustering(n_clusters=5, linkage="ward")
meanshift = MeanShift(bin_seeding=True)
miniBatchKMeans = MiniBatchKMeans(init='k-means++', n_clusters=4, n_init=5,
                                  max_no_improvement=10, verbose=0, random_state=random_seed)
dbscan = DBSCAN(eps=0.2)
dbscan2 = DBSCAN(eps=0.1)

algorithms = [(k_means, "KMeans"),
              (agglo, "AC"),
              (meanshift, "MeanShift"),
              (miniBatchKMeans, "MiniBatchKM"),
              (dbscan, "DBSCAN")]

# Same line-up as above but with the tighter DBSCAN radius.
algorithms2 = [(k_means, "KMeans"),
               (agglo, "AC"),
               (meanshift, "MeanShift"),
               (miniBatchKMeans, "MiniBatchKM"),
               (dbscan2, "DBSCAN2")]

# Parameter sweeps over the number of clusters (5 to 8 inclusive).
algorithm_kmeans = []
for i in range(5, 9):
    # random_state added so the sweep is reproducible, matching every other
    # seeded estimator in this script.
    kmeans_i = KMeans(init='k-means++', n_clusters=i, n_init=5, random_state=random_seed)
    algorithm_kmeans.append((kmeans_i, "KMeans_" + str(i)))

algorithm_AC = []
for i in range(5, 9):
    agglo_i = AgglomerativeClustering(n_clusters=i, linkage="ward")
    algorithm_AC.append((agglo_i, "AC_" + str(i)))

algorithm_miniBatch = []
for i in range(5, 9):
    miniBatch_i = MiniBatchKMeans(init='k-means++', n_clusters=i, n_init=5,
                                  max_no_improvement=10, verbose=0, random_state=random_seed)
    algorithm_miniBatch.append((miniBatch_i, "MiniBatchKM_" + str(i)))

# Case study 1
executeClustering(algorithms, X, 1)
executeClustering(algorithm_kmeans, X, 1.1)
executeClustering(algorithm_AC, X, 1.2)

# Case study 2
executeClustering(algorithms, X_2, 2)
executeClustering(algorithm_kmeans, X_2, 2.1)
executeClustering(algorithm_miniBatch, X_2, 2.2)

# Case study 3
executeClustering(algorithms2, X_3, 3)
executeClustering(algorithm_kmeans, X_3, 3.1)
executeClustering(algorithm_miniBatch, X_3, 3.2)
|