5-3-KNN-detect-abnormal-operation.py 1.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
from nltk import FreqDist
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

from datasets import Datasets
  7. # 特征提取,统计该操作序列中的10个与整个数据最频繁使用的前50个命令以及最不频繁使用的前50个命令计算重合程度
  8. def get_feature(cmd, fdist):
  9. max_cmd = set(fdist[0:50])
  10. min_cmd = set(fdist[-50:])
  11. feature = []
  12. for block in cmd:
  13. f1 = len(set(block))
  14. fdist = list(FreqDist(block).keys())
  15. f2 = fdist[0:10]
  16. f3 = fdist[-10:]
  17. f2 = len(set(f2) & set(max_cmd))
  18. f3 = len(set(f3) & set(min_cmd))
  19. x = [f1, f2, f3]
  20. feature.append(x)
  21. return feature
  22. def main():
  23. data, y, fdist = Datasets.load_Schonlau('User3')
  24. # 特征提取
  25. x = get_feature(data, fdist)
  26. # 训练数据 120 测试数据后30
  27. # x_train, y_train = x[0:100], y[0:100]
  28. # x_test, y_test = x[100:150], y[100:150]
  29. # print(x_test, y_test)
  30. x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
  31. # knn训练
  32. # knn = KNeighborsClassifier()
  33. # knn.fit(x_train, y_train)
  34. # # 查看模型分数
  35. # print(knn.score(x_test, y_test))
  36. #
  37. # # 交叉验证 分10组
  38. # scores = cross_val_score(knn, x, y, cv=10, scoring="accuracy")
  39. # print(scores.mean())
  40. # 判断k值
  41. k_range = range(1, 30)
  42. k_scores = []
  43. for k in k_range:
  44. knn = KNeighborsClassifier(n_neighbors=k)
  45. scores = cross_val_score(knn, x, y, cv=10, scoring="accuracy")
  46. k_scores.append(scores.mean())
  47. plt.plot(k_range, k_scores)
  48. plt.xlabel("Value of K for KNN")
  49. plt.ylabel("Cross Validated Accuracy")
  50. plt.show()
  51. # 根据图来看 k=3 模型最优 约96%
  52. if __name__ == "__main__":
  53. main()