# Source repository: LiuFan/PrivacyScanData
from exts import *
from nltk import FreqDist
import numpy as np
import pandas as pd
import os
import re
import gzip
import pickle


class Datasets:
    """Collection of static loaders for the sample datasets under ``data/``.

    All paths are hard-coded and relative to the current working directory.
    Several loaders delegate to helpers that are not defined in this class
    (``load_one_flle``, ``dir_list``, ``load_files``, ``load_alexa``,
    ``load_dga``, ``load_filename``, ``load_files_lable``,
    ``string_to_semi_redundant_sequences``); they are presumably supplied by
    the module-level ``from exts import *`` -- verify there for their exact
    return types.
    """

    @staticmethod
    def load_Schonlau(user):
        """Load one user's command log from the Schonlau masquerade dataset.

        The log holds one command per line and is cut into consecutive
        sequences of 100 commands; a trailing partial sequence is dropped.
        Labels are read from ``label.txt`` (one whitespace-separated column
        per user) and prefixed with 50 zeros, because the label file does not
        cover the first 50 sequences.  Per the original notes a full log has
        15000 commands (150 sequences): the first 5000 commands are all
        normal, the remaining 10000 may contain masquerader commands.

        :param user: file name of the log under ``data/MasqueradeDat/``; it
            must end with the 1-based user number, e.g. ``"User7"`` or
            ``"User12"``.
        :return: ``(data, y, fdist)`` -- the list of 100-command sequences,
            the per-sequence labels, and the distinct commands in first-seen
            order.
        :raises ValueError: if ``user`` does not end with a positive number.
        """
        seq_len = 100

        with open("data/MasqueradeDat/" + user) as f:
            all_cmd = [line.strip('\n') for line in f]

        # Only complete sequences are kept; leftover commands still count
        # towards the vocabulary below.
        complete = len(all_cmd) - len(all_cmd) % seq_len
        data = [all_cmd[start:start + seq_len]
                for start in range(0, complete, seq_len)]

        # dict.fromkeys de-duplicates while preserving first-seen order.
        fdist = list(dict.fromkeys(all_cmd))

        # Use the whole trailing number: looking at the last character only
        # would map "User10" to column -1 and "User23" to column 2.
        match = re.search(r"(\d+)$", user)
        if match is None or int(match.group(1)) < 1:
            raise ValueError(
                "user must end with a positive user number, got %r" % (user,))
        index = int(match.group(1)) - 1

        with open("data/MasqueradeDat/label.txt") as f:
            labels = [int(line.split()[index]) for line in f]
        y = [0] * 50 + labels

        return data, y, fdist

    @staticmethod
    def load_kdd99():
        """Load the KDD99 ``corrected`` file.

        :return: a list with one entry per line, each entry being the list of
            comma-separated fields of that line (all kept as strings).
        """
        with open('data/kddcup99/corrected') as f:
            return [line.strip('\n').split(",") for line in f]

    @staticmethod
    def load_adfa_normal():
        """Load the ADFA-LD normal (training) traces.

        Every regular file directly inside the training directory is passed
        to ``load_one_flle``; sub-directories are skipped.

        :return: ``(x, y)`` -- the loaded traces and a list of ``0`` labels of
            the same length.
        """
        base = 'data/ADFA-LD/Training_Data_Master/'
        x = []
        for name in os.listdir(base):
            path = os.path.join(base, name)
            if os.path.isfile(path):
                x.append(load_one_flle(path))
        y = [0] * len(x)

        return x, y

    @staticmethod
    def load_adfa_attack(reg):
        """Load the ADFA-LD attack traces whose path matches ``reg``.

        :param reg: regular expression for the attack type; it is appended to
            the attack directory prefix and matched against the start of each
            path returned by ``dir_list``.
        :return: ``(x, y)`` -- the loaded traces and a list of ``1`` labels of
            the same length.
        """
        root = "data/ADFA-LD/Attack_Data_Master/"
        # Compile once instead of rebuilding the pattern for every file.
        pattern = re.compile(root + reg)
        x = [load_one_flle(path)
             for path in dir_list(root, [])
             if pattern.match(path)]
        y = [1] * len(x)

        return x, y

    @staticmethod
    def load_php_webshell():
        """Load the PHP webshell samples and the WordPress files used as the
        benign counterpart.

        :return: ``(webshell, wordpress)`` as returned by ``load_files``.
        """
        webshell = load_files("data/PHP-WEBSHELL/xiaoma/")
        wordpress = load_files("data/wordpress/")

        return webshell, wordpress

    @staticmethod
    def load_dga_domain():
        """Load the DGA domain data set with Alexa top-1000 as benign data.

        :return: ``(x, y)`` -- the concatenated domains and their labels:
            ``0`` for Alexa, ``1`` for the cryptolocker list, ``2`` for the
            post-tovar-goz list.
        """
        alexa = load_alexa('data/domain/top-1000.csv')
        cry = load_dga('data/domain/dga-cryptolocke-1000.txt')
        goz = load_dga('data/domain/dga-post-tovar-goz-1000.txt')

        x = np.concatenate((alexa, cry, goz))
        y = np.concatenate(([0] * len(alexa), [1] * len(cry), [2] * len(goz)))

        return x, y

    @staticmethod
    def load_mnist():
        """Load the MNIST handwritten-digit data set from ``mnist.pkl.gz``.

        :return: ``(training_data, valid_data, test_data)`` exactly as stored
            in the pickle.
        """
        with gzip.open('data/MNIST/mnist.pkl.gz', "rb") as fp:
            # NOTE: unpickling executes arbitrary code; only use a data file
            # obtained from a trusted source.
            training_data, valid_data, test_data = pickle.load(fp, encoding="bytes")

        return training_data, valid_data, test_data

    @staticmethod
    def load_xss():
        """Load the XSS data set.

        :return: ``(x, y)`` -- XSS samples (label ``1``) followed by benign
            samples (label ``0``), as produced by ``load_filename``.
        """
        x_bad, y_bad = load_filename('data/XSS/xss-200000.txt', 1)
        x_good, y_good = load_filename('data/XSS/good-200000.txt', 0)

        return x_bad + x_good, y_bad + y_good

    @staticmethod
    def load_secrepo():
        """Load the secrepo IP/domain pairs.

        Each line must contain exactly one tab separating IP and domain.
        Lines whose IP is ``0.0.0.0`` are ignored.

        NOTE(review): the line is not stripped, so every domain key keeps the
        line's trailing newline -- confirm whether callers rely on that.

        :return: ``{ip: {domain: 1, ...}, ...}``.
        """
        ip_list = {}
        with open("data/etl-ip-domain-train.txt") as f:
            for line in f:
                ip, domain = line.split("\t")
                if ip == "0.0.0.0":
                    continue
                ip_list.setdefault(ip, {})[domain] = 1
        return ip_list

    @staticmethod
    def load_spambase():
        """Load the SpamBase data set.

        SpamBase does not contain raw mails but already extracted features
        (frequencies of keywords and special characters).  There are 58
        attributes in total; the last one is the spam flag.

        :return: ``(x, y)`` -- a DataFrame with the 57 feature columns
            ``x1`` .. ``x57`` and a Series with the label column.
        """
        # One name per column in the file (57 features + the label).  With
        # only 57 names pandas would silently turn the first feature column
        # into the index and drop it from the features.
        my_name = ["x%s" % i for i in range(1, 59)]
        data = pd.read_csv("data/spambase/spambase.data", header=None, names=my_name)
        x = data.iloc[:, :-1]
        y = data.iloc[:, -1]

        return x, y

    @staticmethod
    def load_movie_review():
        """Load the Movie Review polarity data set (1000 positive and 1000
        negative reviews, widely used for text classification).

        :return: ``(x, y)`` -- positive reviews (loaded with label ``0``)
            followed by negative reviews (loaded with label ``1``).
        """
        x_pos, y_pos = load_files_lable("data/movie-review-data/review_polarity/txt_sentoken/pos/", 0)
        x_neg, y_neg = load_files_lable("data/movie-review-data/review_polarity/txt_sentoken/neg/", 1)

        return x_pos + x_neg, y_pos + y_neg

    @staticmethod
    def _load_char_sequences(path, maxlen):
        """Read ``path`` fully and cut its text into semi-redundant sequences.

        :param path: text file to read.
        :param maxlen: maximum sequence length (passed as ``seq_maxlen``).
        :return: ``(x, y, char_idx, text)`` where the first three values come
            from ``string_to_semi_redundant_sequences`` (step 3) and ``text``
            is the complete file content.
        """
        # The context manager closes the handle deterministically.
        with open(path, "r") as f:
            text = f.read()
        x, y, char_idx = string_to_semi_redundant_sequences(text, seq_maxlen=maxlen, redun_step=3)

        return x, y, char_idx, text

    @staticmethod
    def load_us_cities(maxlen):
        """Load the US city names as character sequences.

        :param maxlen: maximum sequence length.
        :return: ``(x, y, char_idx, file_lines)``; ``file_lines`` is the whole
            file content as one string.
        """
        return Datasets._load_char_sequences("data/us_cities/US_Cities.txt", maxlen)

    @staticmethod
    def load_wvs_password(maxlen):
        """Load the WVS password list as character sequences.

        :param maxlen: maximum sequence length.
        :return: ``(x, y, char_idx, file_lines)``; ``file_lines`` is the whole
            file content as one string.
        """
        return Datasets._load_char_sequences("data/wvs-pass/wvs-pass.txt", maxlen)

    @staticmethod
    def load_enron1():
        """Load the Enron1 mail corpus.

        :return: ``(x, y)`` -- ham mails (label ``0``) followed by spam mails
            (label ``1``), as produced by ``load_files_lable``.
        """
        x_ham, y_ham = load_files_lable("data/enron1/ham/", 0)
        x_spam, y_spam = load_files_lable("data/enron1/spam/", 1)

        return x_ham + x_spam, y_ham + y_spam