# 10-3-Kmeans-discriminate-DGA.py
# K-Means clustering of Alexa domains versus DGA-generated domains.
  1. from sklearn.cluster import KMeans
  2. from sklearn.feature_extraction.text import CountVectorizer
  3. from sklearn.metrics import calinski_harabasz_score, fowlkes_mallows_score, silhouette_score
  4. from sklearn.manifold import TSNE
  5. from exts import load_alexa, load_dga
  6. import matplotlib.pyplot as plt
  7. import numpy as np
  8. # 提取特征 向量化 以2-gram
  9. def get_feature(x):
  10. cv = CountVectorizer(ngram_range=(2, 2), decode_error="ignore", token_pattern=r"\w", min_df=1)
  11. x = cv.fit_transform(x).toarray()
  12. return x
  13. def main():
  14. aleax = load_alexa('data/domain/top-1000.csv')
  15. cry = load_dga('data/domain/dga-cryptolocke-1000.txt')
  16. goz = load_dga('data/domain/dga-post-tovar-goz-1000.txt')
  17. x = np.concatenate((aleax, cry, goz))
  18. x = get_feature(x)
  19. y = np.concatenate(([0] * len(aleax), [1] * len(cry), [1] * len(goz)))
  20. # 用SVM模型并训练
  21. kmeans = KMeans(n_clusters=2, random_state=170)
  22. kmeans.fit(x)
  23. labels = kmeans.labels_
  24. # FMI评价法(需要真实值)
  25. print(fowlkes_mallows_score(y, labels))
  26. # 轮廓系数法 (不需要真实值)
  27. print(silhouette_score(x, labels))
  28. # Calinski-Harabaz Index评估模型(不需要真实值)
  29. print(calinski_harabasz_score(x, labels))
  30. # 数据降维与可视化(慢、占内存)
  31. tsne = TSNE(learning_rate=100)
  32. x = tsne.fit_transform(x)
  33. for i, label in enumerate(x):
  34. x1, x2 = x[i]
  35. if labels[i] == 1:
  36. plt.scatter(x1, x2, marker='o')
  37. else:
  38. plt.scatter(x1, x2, marker='x')
  39. plt.show()
# Run the clustering demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()