9-4-SVM-discriminate-DGA.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. from sklearn.svm import SVC
  2. from datasets import Datasets
  3. from sklearn.preprocessing import StandardScaler
  4. from sklearn.model_selection import train_test_split, cross_val_score
  5. from exts import load_alexa, load_dga
  6. import matplotlib.pyplot as plt
  7. import numpy as np
  8. import re
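# Feature selection: vowel ratio; ratio of distinct characters to domain length; mean Jaccard index (intersection size over union size); HMM (hidden Markov model) coefficient.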
  10. # 元音字母的比例 x:域名长度 y:元音字母的比例
  11. def aeiou_count(domain_list):
  12. x = []
  13. y = []
  14. for domain in domain_list:
  15. x.append(len(domain))
  16. count = len(re.findall(r'[aeiou]', domain.lower()))
  17. count = (0.0 + count) / len(domain)
  18. y.append(count)
  19. return x, y
  20. # 去重后的字母数字个数与域名长度的比例
  21. def get_uniq_char_num(domain_list):
  22. x = []
  23. y = []
  24. for domain in domain_list:
  25. x.append(len(domain))
  26. count = len(set(domain))
  27. count = (0.0 + count) / len(domain)
  28. y.append(count)
  29. return x, y
  30. # 计算两个域名之间的jarccard系数
  31. def count2string_jarccard_index(a, b):
  32. x = set(' ' + a[0])
  33. y = set(' ' + b[0])
  34. for i in range(0, len(a) - 1):
  35. x.add(a[i] + a[i + 1])
  36. x.add(a[len(a) - 1] + ' ')
  37. for i in range(0, len(b) - 1):
  38. y.add(b[i] + b[i + 1])
  39. y.add(b[len(b) - 1] + ' ')
  40. return (0.0 + len(x - y)) / len(x | y)
  41. # 计算两个域名集合的平均jarccard系数
  42. def get_jarccard_index(a_list, b_list):
  43. x = []
  44. y = []
  45. for a in a_list:
  46. j = 0.0
  47. for b in b_list:
  48. j += count2string_jarccard_index(a, b)
  49. x.append(len(a))
  50. y.append(j / len(b_list))
  51. return x, y
  52. # 平均jarccard系数
  53. def jarccard_mean(domain_list):
  54. x, y = get_jarccard_index(domain_list, aleax)
  55. return x, y
  56. # 根据特征函数画图
  57. def dradrawing(my_feature):
  58. x1, y1 = my_feature(aleax)
  59. x2, y2 = my_feature(cry)
  60. x3, y3 = my_feature(goz)
  61. # 画图
  62. fig, ax = plt.subplots()
  63. ax.set_xlabel('Domain Length')
  64. ax.set_ylabel('Score')
  65. ax.scatter(x3, y3, color='b', label="dga_post-tovar-goz", marker='o')
  66. ax.scatter(x2, y2, color='g', label="dga_cryptolock", marker='v')
  67. ax.scatter(x1, y1, color='r', label="alexa", marker='*')
  68. ax.legend(loc='best')
  69. plt.show()
  70. # 特征提取
  71. def get_feature(domain_list):
  72. x = []
  73. _, x1 = aeiou_count(domain_list)
  74. _, x2 = get_uniq_char_num(domain_list)
  75. _, x3 = jarccard_mean(domain_list)
  76. for i in range(0, len(x1)):
  77. x.append([x1[i], x2[i], x3[i]])
  78. return x
  79. def main():
  80. # 画图
  81. # dradrawing(aeiou_count)
  82. # dradrawing(get_uniq_char_num)
  83. # dradrawing(jarccard_mean)
  84. # 特征提取
  85. x = np.concatenate((get_feature(aleax), get_feature(cry), get_feature(goz)))
  86. y = np.concatenate(([0] * len(aleax), [1] * len(cry), [2] * len(goz)))
  87. # 标准化
  88. std = StandardScaler()
  89. x = std.fit_transform(x)
  90. x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3)
  91. # 用SVM模型并训练
  92. clf = SVC(kernel='linear')
  93. clf.fit(x_train, y_train)
  94. print(clf.score(x_test, y_test))
  95. if __name__ == "__main__":
  96. aleax = load_alexa('data/domain/top-1000.csv')
  97. cry = load_dga('data/domain/dga-cryptolocke-1000.txt')
  98. goz = load_dga('data/domain/dga-post-tovar-goz-1000.txt')
  99. main()