123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196 |
- from exts import *
- from nltk import FreqDist
- import numpy as np
- import pandas as pd
- import os
- import re
- import gzip
- import pickle
class Datasets:
    """Static loaders for the sample security / ML datasets stored under ``data/``.

    Every loader reads from a hard-coded path relative to the current working
    directory. Several loaders delegate to helper functions that are not
    defined in this class (``load_one_flle``, ``dir_list``, ``load_files``,
    ``load_alexa``, ``load_dga``, ``load_filename``, ``load_files_lable``,
    ``string_to_semi_redundant_sequences``); presumably they are provided by
    ``from exts import *`` at the top of the file -- verify there.
    """

    @staticmethod
    def load_Schonlau(user):
        """Load one user's command history from the Schonlau masquerade dataset.

        The user's file holds one command per line. Commands are grouped into
        consecutive sequences of 100; a trailing partial sequence is dropped.
        According to the dataset description the file holds 15000 commands
        (150 sequences): the first 5000 are all legitimate and the remaining
        10000 may contain masquerader commands.

        :param user: file name of the user under ``data/MasqueradeDat/``; it
            must end with the user's 1-based number (e.g. ``"User7"``,
            ``"User12"``).
        :return: ``(data, y, fdist)`` where ``data`` is the list of
            100-command sequences, ``y`` is the per-sequence label list (50
            leading zeros followed by this user's column of ``label.txt``)
            and ``fdist`` is the list of distinct commands in first-seen
            order.
        :raises ValueError: if ``user`` does not end with a number.
        """
        seq_len = 100

        with open("data/MasqueradeDat/" + user) as f:
            all_cmd = [line.strip('\n') for line in f]

        # Only complete sequences are kept, exactly as the original
        # counter-based loop did.
        complete = len(all_cmd) - len(all_cmd) % seq_len
        data = [all_cmd[start:start + seq_len]
                for start in range(0, complete, seq_len)]

        # Distinct commands in first-seen order. This is what
        # list(FreqDist(all_cmd).keys()) yields on NLTK 3 (FreqDist is a
        # Counter subclass) without needing NLTK for a simple de-duplication.
        fdist = list(dict.fromkeys(all_cmd))

        # The label column is selected by the user's number. Parse the whole
        # trailing number: looking only at the last character picked the
        # wrong column for every user numbered 10 or above.
        number = re.search(r"(\d+)$", user)
        if number is None:
            raise ValueError(
                "user name must end with its number, got %r" % (user,))
        index = int(number.group(1)) - 1

        with open("data/MasqueradeDat/label.txt") as f:
            # Blank lines (e.g. a trailing newline at EOF) carry no label.
            labels = [int(line.split()[index]) for line in f if line.strip()]

        # label.txt does not cover the first 50 sequences; they are all
        # legitimate, hence label 0.
        y = [0] * 50 + labels
        return data, y, fdist

    @staticmethod
    def load_kdd99():
        """Load the KDD99 dataset.

        :return: one list of comma-separated string fields per line of
            ``data/kddcup99/corrected``.
        """
        with open('data/kddcup99/corrected') as f:
            return [line.strip('\n').split(",") for line in f]

    @staticmethod
    def load_adfa_normal():
        """Load the normal traces of the ADFA-LD dataset.

        Every regular file directly inside the training directory is loaded
        with ``load_one_flle`` (helper defined outside this class).

        :return: ``(x, y)`` where ``x`` holds the loaded traces and ``y`` is a
            list of ``0`` labels of the same length.
        """
        base_dir = 'data/ADFA-LD/Training_Data_Master/'
        x = []
        for name in os.listdir(base_dir):
            path = os.path.join(base_dir, name)
            if os.path.isfile(path):
                x.append(load_one_flle(path))
        return x, [0] * len(x)

    @staticmethod
    def load_adfa_attack(reg):
        """Load the attack traces of the ADFA-LD dataset.

        :param reg: regular expression selecting the attack-type files; it is
            appended to the attack directory prefix and matched against each
            path returned by ``dir_list``.
        :return: ``(x, y)`` where ``x`` holds the loaded traces and ``y`` is a
            list of ``1`` labels of the same length.
        """
        # Compile once instead of re-resolving the pattern for every file.
        pattern = re.compile("data/ADFA-LD/Attack_Data_Master/" + reg)
        all_file = dir_list("data/ADFA-LD/Attack_Data_Master/", [])
        x = [load_one_flle(file) for file in all_file if pattern.match(file)]
        return x, [1] * len(x)

    @staticmethod
    def load_php_webshell():
        """Load the PHP webshell dataset; WordPress serves as the benign set.

        :return: ``(webshell, wordpress)`` as returned by ``load_files``.
        """
        webshell = load_files("data/PHP-WEBSHELL/xiaoma/")
        wordpress = load_files("data/wordpress/")
        return webshell, wordpress

    @staticmethod
    def load_dga_domain():
        """Load the DGA domain dataset; the Alexa top 1000 is the benign set.

        :return: ``(x, y)`` numpy arrays. Labels are ``0`` for Alexa domains,
            ``1`` for the cryptolocker file and ``2`` for the
            post-tovar-goz file.
        """
        alexa = load_alexa('data/domain/top-1000.csv')
        cry = load_dga('data/domain/dga-cryptolocke-1000.txt')
        goz = load_dga('data/domain/dga-post-tovar-goz-1000.txt')
        x = np.concatenate((alexa, cry, goz))
        y = np.concatenate(([0] * len(alexa), [1] * len(cry), [2] * len(goz)))
        return x, y

    @staticmethod
    def load_mnist():
        """Load the MNIST handwritten-digit dataset.

        MNIST is an entry-level computer-vision dataset of handwritten digit
        images, each with a label telling which digit it shows.

        :return: ``(training_data, valid_data, test_data)`` as stored in the
            pickle.
        """
        # NOTE: pickle.load executes arbitrary code embedded in the file;
        # only use this with a trusted copy of mnist.pkl.gz.
        with gzip.open('data/MNIST/mnist.pkl.gz', "rb") as fp:
            training_data, valid_data, test_data = pickle.load(
                fp, encoding="bytes")
        return training_data, valid_data, test_data

    @staticmethod
    def load_xss():
        """Load the XSS dataset.

        :return: ``(x, y)`` with the XSS samples (label ``1``) followed by
            the benign samples (label ``0``).
        """
        x1, y1 = load_filename('data/XSS/xss-200000.txt', 1)
        x2, y2 = load_filename('data/XSS/good-200000.txt', 0)
        return x1 + x2, y1 + y2

    @staticmethod
    def load_secrepo():
        """Load the secrepo ip/domain training data.

        Each line of the file is ``ip<TAB>domain``. Entries whose ip is
        ``0.0.0.0`` are ignored, as are blank lines.

        :return: dict mapping each ip to a dict ``{domain: 1}`` of the
            domains seen for it.
        :raises ValueError: if a non-blank line does not consist of exactly
            two tab-separated fields.
        """
        ip_list = {}
        with open("data/etl-ip-domain-train.txt") as f:
            for line in f:
                # Drop the line terminator so it does not end up inside the
                # domain key (the other loaders strip it as well).
                line = line.rstrip('\r\n')
                if not line:
                    continue
                ip, domain = line.split("\t")
                if ip == "0.0.0.0":
                    continue
                ip_list.setdefault(ip, {})[domain] = 1
        return ip_list

    @staticmethod
    def load_spambase():
        """Load the SpamBase dataset.

        SpamBase does not contain raw mail bodies but already-extracted
        features: frequencies of selected keywords and special characters.
        Each row has 58 attributes, the last one being the spam flag.

        :return: ``(x, y)`` where ``x`` is a DataFrame with the 57 feature
            columns and ``y`` is the Series holding the spam flag.
        """
        # 58 names, one per column. Supplying only 57 would make pandas use
        # the first feature column as the row index, silently removing it
        # from the feature matrix.
        my_name = ["x%s" % i for i in range(1, 59)]
        data = pd.read_csv("data/spambase/spambase.data", header=None,
                           names=my_name)
        x = data.iloc[:, :-1]
        y = data.iloc[:, -1]
        return x, y

    @staticmethod
    def load_movie_review():
        """Load the Movie Review polarity dataset.

        The dataset holds 1000 positive and 1000 negative reviews and is
        widely used for text classification, notably abusive-review
        detection.

        :return: ``(x, y)`` with the positive reviews (label ``0``) followed
            by the negative reviews (label ``1``).
        """
        x1, y1 = load_files_lable(
            "data/movie-review-data/review_polarity/txt_sentoken/pos/", 0)
        x2, y2 = load_files_lable(
            "data/movie-review-data/review_polarity/txt_sentoken/neg/", 1)
        return x1 + x2, y1 + y2

    @staticmethod
    def _load_char_sequences(path, maxlen):
        """Read ``path`` and split its text into semi-redundant sequences.

        :param path: text file to read.
        :param maxlen: maximum sequence length handed to
            ``string_to_semi_redundant_sequences``.
        :return: ``(x, y, char_idx, file_lines)`` where ``file_lines`` is the
            whole file content as a single string.
        """
        # The context manager closes the file; the original left it open.
        with open(path, "r") as f:
            file_lines = f.read()
        x, y, char_idx = string_to_semi_redundant_sequences(
            file_lines, seq_maxlen=maxlen, redun_step=3)
        return x, y, char_idx, file_lines

    @staticmethod
    def load_us_cities(maxlen):
        """Load the US city names corpus.

        :param maxlen: maximum length of a sequence.
        :return: ``(x, y, char_idx, file_lines)``.
        """
        return Datasets._load_char_sequences(
            "data/us_cities/US_Cities.txt", maxlen)

    @staticmethod
    def load_wvs_password(maxlen):
        """Load the WVS password list.

        :param maxlen: maximum length of a sequence.
        :return: ``(x, y, char_idx, file_lines)``.
        """
        return Datasets._load_char_sequences(
            "data/wvs-pass/wvs-pass.txt", maxlen)

    @staticmethod
    def load_enron1():
        """Load the Enron1 mail corpus.

        :return: ``(x, y)`` with the ham mails (label ``0``) followed by the
            spam mails (label ``1``).
        """
        x1, y1 = load_files_lable("data/enron1/ham/", 0)
        x2, y2 = load_files_lable("data/enron1/spam/", 1)
        return x1 + x2, y1 + y2
|