123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354 |
- from sklearn.cluster import KMeans
- from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.metrics import calinski_harabasz_score, fowlkes_mallows_score, silhouette_score
- from sklearn.manifold import TSNE
- from exts import load_alexa, load_dga
- import matplotlib.pyplot as plt
- import numpy as np
# Feature extraction: vectorize each domain as 2-gram counts.
def get_feature(x):
    """Turn domain strings into a dense matrix of 2-gram counts.

    The vectorizer tokenizes every input into single word characters and
    counts each pair of consecutive tokens, so non-word characters such as
    ``.`` are skipped before the pairs are formed.

    Args:
        x: Iterable of domain names to vectorize. Input that cannot be
            decoded is ignored rather than raising.

    Returns:
        The dense array produced by ``fit_transform(...).toarray()``, with
        one row per input and one column per 2-gram seen in ``x``.
    """
    vectorizer = CountVectorizer(
        ngram_range=(2, 2),      # only pairs of consecutive tokens
        decode_error="ignore",
        token_pattern=r"\w",     # one token per word character
        min_df=1,
    )
    sparse_counts = vectorizer.fit_transform(x)
    return sparse_counts.toarray()
def main():
    """Cluster benign and DGA domains with K-Means, then score and plot.

    Loads one Alexa list and two DGA lists, vectorizes them with
    ``get_feature``, fits a two-cluster K-Means model, prints three
    clustering scores and finally shows a t-SNE scatter plot of the samples
    marked by their assigned cluster.

    Note that K-Means cluster ids are arbitrary: cluster ``1`` is not
    guaranteed to correspond to the DGA class labelled ``1`` in ``y``.
    """
    alexa = load_alexa('data/domain/top-1000.csv')
    cry = load_dga('data/domain/dga-cryptolocke-1000.txt')
    goz = load_dga('data/domain/dga-post-tovar-goz-1000.txt')

    x = get_feature(np.concatenate((alexa, cry, goz)))
    # Ground truth used only for scoring: 0 = Alexa, 1 = either DGA family.
    y = np.concatenate(([0] * len(alexa), [1] * len(cry), [1] * len(goz)))

    # Fit the K-Means model (unsupervised; y is not used for training).
    kmeans = KMeans(n_clusters=2, random_state=170)
    kmeans.fit(x)
    labels = kmeans.labels_

    # Fowlkes-Mallows index (requires the ground-truth labels).
    print(fowlkes_mallows_score(y, labels))
    # Silhouette coefficient (does not need ground truth).
    print(silhouette_score(x, labels))
    # Calinski-Harabasz index (does not need ground truth).
    print(calinski_harabasz_score(x, labels))

    # Dimensionality reduction for visualization (slow and memory hungry).
    tsne = TSNE(learning_rate=100)
    embedded = tsne.fit_transform(x)

    # Draw each cluster with a single scatter call instead of one call per
    # sample: this avoids creating thousands of artists and gives every
    # cluster one consistent colour instead of a colour cycled per point.
    in_cluster_one = labels == 1
    plt.scatter(embedded[in_cluster_one, 0], embedded[in_cluster_one, 1],
                marker='o')
    plt.scatter(embedded[~in_cluster_one, 0], embedded[~in_cluster_one, 1],
                marker='x')
    plt.show()
# Run the clustering experiment only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()
|