# 10-4-DBSCAN-demo.py (1.9 KB)
  1. import matplotlib.pyplot as plt
  2. import numpy as np
  3. from sklearn.cluster import DBSCAN
  4. from sklearn.datasets import make_blobs
  5. from sklearn.preprocessing import StandardScaler
  6. from sklearn import metrics
  7. def show_dbscan():
  8. # 生成测试样本
  9. centers = [[1, 1], [-1, -1], [1, -1]]
  10. x, y = make_blobs(n_samples=750, centers=centers, cluster_std=0.4, random_state=0)
  11. x = StandardScaler().fit_transform(x)
  12. # 聚类,半径0.3 最少样本数数10
  13. db = DBSCAN(eps=0.3, min_samples=10).fit(x)
  14. core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
  15. core_samples_mask[db.core_sample_indices_] = True
  16. labels = db.labels_
  17. # 查看分为几簇
  18. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
  19. print("模型分为:%d簇" % n_clusters_)
  20. # 评估模型
  21. # FMI评价法(需要真实值)
  22. print("FMI评价法: %0.3f" % metrics.fowlkes_mallows_score(y, labels))
  23. # Calinski-Harabaz Index评估模型(不需要真实值)
  24. print("Calinski-Harabaz Index评估法: %0.3f" % metrics.calinski_harabasz_score(x, labels))
  25. # 轮廓系数法 (不需要真实值)
  26. print("轮廓系数法: %0.3f" % metrics.silhouette_score(x, labels))
  27. # 画图 噪音用黑色点
  28. unique_labels = set(labels)
  29. colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
  30. for k, col in zip(unique_labels, colors):
  31. if k == -1:
  32. col = 'k'
  33. class_member_mask = (labels == k)
  34. xy = x[class_member_mask & core_samples_mask]
  35. plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14)
  36. xy = x[class_member_mask & ~core_samples_mask]
  37. plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=6)
  38. plt.title('Estimated number of clusters: %d' % n_clusters_)
  39. plt.show()
  40. if __name__ == '__main__':
  41. show_dbscan()