datasets.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. from exts import *
  2. from nltk import FreqDist
  3. import numpy as np
  4. import pandas as pd
  5. import os
  6. import re
  7. import gzip
  8. import pickle
  9. class Datasets:
  10. """加载数据集"""
  11. @staticmethod
  12. def load_Schonlau(user):
  13. """加载Masqera Schonlau数据集,导入用户的操作
  14. 加载数据集,100命令为1个序列、共150个。共15000个命令。前5000都正常命令、后10000有恶意命令。
  15. :param user: 导入哪个用户的操作
  16. :return:
  17. """
  18. with open("data/MasqueradeDat/" + user) as f:
  19. lines = f.readlines()
  20. i = 0
  21. x = []
  22. all_cmd = []
  23. data = []
  24. for line in lines:
  25. line = line.strip('\n')
  26. x.append(line)
  27. all_cmd.append(line)
  28. i += 1
  29. if i == 100:
  30. data.append(x)
  31. x = []
  32. i = 0
  33. fdist = list(FreqDist(all_cmd).keys())
  34. # 加载labels
  35. index = int(user[-1]) - 1
  36. y = []
  37. with open("data/MasqueradeDat/label.txt") as f:
  38. for line in f:
  39. line = line.strip('\n')
  40. y.append(int(line.split()[index]))
  41. y = [0] * 50 + y
  42. return data, y, fdist
  43. @staticmethod
  44. def load_kdd99():
  45. """加载KDD99数据集"""
  46. with open('data/kddcup99/corrected') as f:
  47. lines = f.readlines()
  48. data = []
  49. for line in lines:
  50. line = line.strip('\n')
  51. line = line.split(",")
  52. data.append(line)
  53. return data
  54. @staticmethod
  55. def load_adfa_normal():
  56. """加载ADFA-LD 正常数据集"""
  57. x = []
  58. y = []
  59. list = os.listdir('data/ADFA-LD/Training_Data_Master/')
  60. for i in range(0, len(list)):
  61. path = os.path.join('data/ADFA-LD/Training_Data_Master/', list[i])
  62. if os.path.isfile(path):
  63. x.append(load_one_flle(path))
  64. y.append(0)
  65. return x, y
  @staticmethod
  def load_adfa_attack(reg):
      """Load the ADFA-LD attack traces whose path matches a pattern.

      :param reg: regular-expression fragment selecting the attack type; it
          is appended to ``data/ADFA-LD/Attack_Data_Master/`` and matched
          from the start of each path returned by ``dir_list``.
      :return: ``(x, y)`` where ``x`` holds one ``load_one_flle`` result per
          matching file and ``y`` is a list of ones of the same length.
      """
      pattern = "data/ADFA-LD/Attack_Data_Master/" + reg
      candidates = dir_list("data/ADFA-LD/Attack_Data_Master/", [])
      x = [load_one_flle(path) for path in candidates if re.match(pattern, path)]
      y = [1] * len(x)
      return x, y
  @staticmethod
  def load_php_webshell():
      """Load the PHP webshell dataset, with WordPress as the normal class.

      :return: ``(webshell, wordpress)``: the results of ``load_files`` for
          ``data/PHP-WEBSHELL/xiaoma/`` and ``data/wordpress/`` respectively.
      """
      sources = ("data/PHP-WEBSHELL/xiaoma/", "data/wordpress/")
      malicious, benign = [load_files(folder) for folder in sources]
      return malicious, benign
  @staticmethod
  def load_dga_domain():
      """Load the DGA domain dataset, with the Alexa top 1000 as normal data.

      Labels are assigned per source: 0 for the Alexa list, 1 for the
      cryptolocker DGA list and 2 for the post-tovar-goz DGA list.

      :return: ``(x, y)``: the three sources concatenated with
          ``np.concatenate`` and the matching label array.
      """
      groups = (
          load_alexa('data/domain/top-1000.csv'),
          load_dga('data/domain/dga-cryptolocke-1000.txt'),
          load_dga('data/domain/dga-post-tovar-goz-1000.txt'),
      )
      labels = tuple([label] * len(group) for label, group in enumerate(groups))
      x = np.concatenate(groups)
      y = np.concatenate(labels)
      return x, y
  95. @staticmethod
  96. def load_mnist():
  97. """加载MNIST数据集,MNIST是一个入门级的计算机视觉数据集,它包含各种手写数字图片,也包含每一张图片对应的标签,告诉我们这个是数字几"""
  98. with gzip.open('data/MNIST/mnist.pkl.gz', "rb") as fp:
  99. training_data, valid_data, test_data = pickle.load(fp, encoding="bytes")
  100. return training_data, valid_data, test_data
  @staticmethod
  def load_xss():
      """Load the XSS dataset.

      ``data/XSS/xss-200000.txt`` is loaded with label 1 and
      ``data/XSS/good-200000.txt`` with label 0, both through
      ``load_filename``.

      :return: ``(x, y)``: the XSS part followed by the good part, joined
          with ``+``.
      """
      attack_x, attack_y = load_filename('data/XSS/xss-200000.txt', 1)
      benign_x, benign_y = load_filename('data/XSS/good-200000.txt', 0)
      samples = attack_x + benign_x
      labels = attack_y + benign_y
      return samples, labels
  107. @staticmethod
  108. def load_secrepo():
  109. """加载secrepo估计数据(ip/域名)"""
  110. ip_list = {}
  111. with open("data/etl-ip-domain-train.txt") as f:
  112. for line in f:
  113. (ip, domain) = line.split("\t")
  114. if not ip == "0.0.0.0":
  115. if ip not in ip_list:
  116. ip_list[ip] = {}
  117. ip_list[ip][domain] = 1
  118. return ip_list
  119. @staticmethod
  120. def load_spambase():
  121. """SpamBase的数据不是原始的邮件内容而是已经特征化的数据,对应的特征是统计的关键字以及特殊符号的词频.
  122. 一共58个属性,其中最后一个是垃圾邮件的标记位"""
  123. my_name = ["x%s" % i for i in range(1, 58)]
  124. data = pd.read_csv("data/spambase/spambase.data", header=None, names=my_name)
  125. x = data.iloc[:, :-1]
  126. y = data.iloc[:, -1]
  127. return x, y
  @staticmethod
  def load_movie_review():
      """Load the Movie Review polarity dataset.

      The dataset contains 1000 positive and 1000 negative reviews and is
      widely used for text classification, in particular for spotting
      malicious reviews.  The ``pos`` folder is loaded with label 0 and the
      ``neg`` folder with label 1, both through ``load_files_lable``.

      :return: ``(x, y)``: the positive part followed by the negative part,
          joined with ``+``.
      """
      base = "data/movie-review-data/review_polarity/txt_sentoken/"
      pos_x, pos_y = load_files_lable(base + "pos/", 0)
      neg_x, neg_y = load_files_lable(base + "neg/", 1)
      reviews = pos_x + neg_x
      labels = pos_y + neg_y
      return reviews, labels
  @staticmethod
  def load_us_cities(maxlen):
      """Load the US city names as character sequences.

      The whole of ``data/us_cities/US_Cities.txt`` is read as one string and
      handed to ``string_to_semi_redundant_sequences`` with ``redun_step=3``.

      :param maxlen: maximum sequence length (passed as ``seq_maxlen``).
      :return: ``(x, y, char_idx, file_lines)``: the three values produced by
          ``string_to_semi_redundant_sequences`` plus the raw file text.
      """
      path = "data/us_cities/US_Cities.txt"
      # Use a context manager so the file handle is closed deterministically.
      with open(path, "r") as f:
          file_lines = f.read()
      x, y, char_idx = string_to_semi_redundant_sequences(
          file_lines, seq_maxlen=maxlen, redun_step=3
      )
      return x, y, char_idx, file_lines
  @staticmethod
  def load_wvs_password(maxlen):
      """Load the WVS password list as character sequences.

      The whole of ``data/wvs-pass/wvs-pass.txt`` is read as one string and
      handed to ``string_to_semi_redundant_sequences`` with ``redun_step=3``.

      :param maxlen: maximum sequence length (passed as ``seq_maxlen``).
      :return: ``(x, y, char_idx, file_lines)``: the three values produced by
          ``string_to_semi_redundant_sequences`` plus the raw file text.
      """
      path = "data/wvs-pass/wvs-pass.txt"
      # Use a context manager so the file handle is closed deterministically.
      with open(path, "r") as f:
          file_lines = f.read()
      x, y, char_idx = string_to_semi_redundant_sequences(
          file_lines, seq_maxlen=maxlen, redun_step=3
      )
      return x, y, char_idx, file_lines
  @staticmethod
  def load_enron1():
      """Load the Enron1 spam e-mail dataset.

      ``data/enron1/ham/`` is loaded with label 0 and ``data/enron1/spam/``
      with label 1, both through ``load_files_lable``.

      :return: ``(x, y)``: the ham part followed by the spam part, joined
          with ``+``.
      """
      ham_x, ham_y = load_files_lable("data/enron1/ham/", 0)
      spam_x, spam_y = load_files_lable("data/enron1/spam/", 1)
      mails = ham_x + spam_x
      labels = ham_y + spam_y
      return mails, labels