12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152 |
- import matplotlib.pyplot as plt
- import numpy as np
- from sklearn.cluster import DBSCAN
- from sklearn.datasets import make_blobs
- from sklearn.preprocessing import StandardScaler
- from sklearn import metrics
- def show_dbscan():
- # 生成测试样本
- centers = [[1, 1], [-1, -1], [1, -1]]
- x, y = make_blobs(n_samples=750, centers=centers, cluster_std=0.4, random_state=0)
- x = StandardScaler().fit_transform(x)
- # 聚类,半径0.3 最少样本数数10
- db = DBSCAN(eps=0.3, min_samples=10).fit(x)
- core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
- core_samples_mask[db.core_sample_indices_] = True
- labels = db.labels_
- # 查看分为几簇
- n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
- print("模型分为:%d簇" % n_clusters_)
- # 评估模型
- # FMI评价法(需要真实值)
- print("FMI评价法: %0.3f" % metrics.fowlkes_mallows_score(y, labels))
- # Calinski-Harabaz Index评估模型(不需要真实值)
- print("Calinski-Harabaz Index评估法: %0.3f" % metrics.calinski_harabasz_score(x, labels))
- # 轮廓系数法 (不需要真实值)
- print("轮廓系数法: %0.3f" % metrics.silhouette_score(x, labels))
- # 画图 噪音用黑色点
- unique_labels = set(labels)
- colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
- for k, col in zip(unique_labels, colors):
- if k == -1:
- col = 'k'
- class_member_mask = (labels == k)
- xy = x[class_member_mask & core_samples_mask]
- plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14)
- xy = x[class_member_mask & ~core_samples_mask]
- plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=6)
- plt.title('Estimated number of clusters: %d' % n_clusters_)
- plt.show()
- if __name__ == '__main__':
- show_dbscan()
|